mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-22 12:54:26 +01:00
Remove the vectors from the documents database
This commit is contained in:
parent
7a84697570
commit
84e498299b
2
Cargo.lock
generated
2
Cargo.lock
generated
@ -2455,6 +2455,7 @@ name = "index-scheduler"
|
|||||||
version = "1.9.0"
|
version = "1.9.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"arroy",
|
||||||
"big_s",
|
"big_s",
|
||||||
"bincode",
|
"bincode",
|
||||||
"crossbeam",
|
"crossbeam",
|
||||||
@ -2465,6 +2466,7 @@ dependencies = [
|
|||||||
"file-store",
|
"file-store",
|
||||||
"flate2",
|
"flate2",
|
||||||
"insta",
|
"insta",
|
||||||
|
"maplit",
|
||||||
"meili-snap",
|
"meili-snap",
|
||||||
"meilisearch-auth",
|
"meilisearch-auth",
|
||||||
"meilisearch-types",
|
"meilisearch-types",
|
||||||
|
@ -40,7 +40,9 @@ ureq = "2.9.7"
|
|||||||
uuid = { version = "1.6.1", features = ["serde", "v4"] }
|
uuid = { version = "1.6.1", features = ["serde", "v4"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
|
arroy = "0.3.1"
|
||||||
big_s = "1.0.2"
|
big_s = "1.0.2"
|
||||||
crossbeam = "0.8.4"
|
crossbeam = "0.8.4"
|
||||||
insta = { version = "1.34.0", features = ["json", "redactions"] }
|
insta = { version = "1.34.0", features = ["json", "redactions"] }
|
||||||
|
maplit = "1.0.2"
|
||||||
meili-snap = { path = "../meili-snap" }
|
meili-snap = { path = "../meili-snap" }
|
||||||
|
@ -1459,11 +1459,11 @@ impl IndexScheduler {
|
|||||||
// TODO: consider using a type alias or a struct embedder/template
|
// TODO: consider using a type alias or a struct embedder/template
|
||||||
pub fn embedders(
|
pub fn embedders(
|
||||||
&self,
|
&self,
|
||||||
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>,
|
embedding_configs: Vec<(String, milli::vector::EmbeddingConfig, RoaringBitmap)>,
|
||||||
) -> Result<EmbeddingConfigs> {
|
) -> Result<EmbeddingConfigs> {
|
||||||
let res: Result<_> = embedding_configs
|
let res: Result<_> = embedding_configs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| {
|
.map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt }, _)| {
|
||||||
let prompt =
|
let prompt =
|
||||||
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
|
Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?);
|
||||||
// optimistically return existing embedder
|
// optimistically return existing embedder
|
||||||
@ -1748,6 +1748,9 @@ mod tests {
|
|||||||
use meilisearch_types::milli::update::IndexDocumentsMethod::{
|
use meilisearch_types::milli::update::IndexDocumentsMethod::{
|
||||||
ReplaceDocuments, UpdateDocuments,
|
ReplaceDocuments, UpdateDocuments,
|
||||||
};
|
};
|
||||||
|
use meilisearch_types::milli::update::Setting;
|
||||||
|
use meilisearch_types::milli::vector::settings::EmbeddingSettings;
|
||||||
|
use meilisearch_types::settings::{Checked, Unchecked};
|
||||||
use meilisearch_types::tasks::IndexSwap;
|
use meilisearch_types::tasks::IndexSwap;
|
||||||
use meilisearch_types::VERSION_FILE_NAME;
|
use meilisearch_types::VERSION_FILE_NAME;
|
||||||
use tempfile::{NamedTempFile, TempDir};
|
use tempfile::{NamedTempFile, TempDir};
|
||||||
@ -3052,7 +3055,9 @@ mod tests {
|
|||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||||
let (_, embedding_config) = configs.first().unwrap();
|
let (name, embedding_config, user_provided) = configs.first().unwrap();
|
||||||
|
insta::assert_snapshot!(name, @"default");
|
||||||
|
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||||
insta::assert_json_snapshot!(embedding_config.embedder_options);
|
insta::assert_json_snapshot!(embedding_config.embedder_options);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -5017,13 +5022,15 @@ mod tests {
|
|||||||
let configs = index.embedding_configs(&rtxn).unwrap();
|
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||||
// for consistency with the below
|
// for consistency with the below
|
||||||
#[allow(clippy::get_first)]
|
#[allow(clippy::get_first)]
|
||||||
let (name, fakerest_config) = configs.get(0).unwrap();
|
let (name, fakerest_config, user_provided) = configs.get(0).unwrap();
|
||||||
insta::assert_json_snapshot!(name, @r###""A_fakerest""###);
|
insta::assert_snapshot!(name, @"A_fakerest");
|
||||||
|
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||||
insta::assert_json_snapshot!(fakerest_config.embedder_options);
|
insta::assert_json_snapshot!(fakerest_config.embedder_options);
|
||||||
let fakerest_name = name.clone();
|
let fakerest_name = name.clone();
|
||||||
|
|
||||||
let (name, simple_hf_config) = configs.get(1).unwrap();
|
let (name, simple_hf_config, user_provided) = configs.get(1).unwrap();
|
||||||
insta::assert_json_snapshot!(name, @r###""B_small_hf""###);
|
insta::assert_snapshot!(name, @"B_small_hf");
|
||||||
|
insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>");
|
||||||
insta::assert_json_snapshot!(simple_hf_config.embedder_options);
|
insta::assert_json_snapshot!(simple_hf_config.embedder_options);
|
||||||
let simple_hf_name = name.clone();
|
let simple_hf_name = name.clone();
|
||||||
|
|
||||||
@ -5091,6 +5098,18 @@ mod tests {
|
|||||||
let index = index_scheduler.index("doggos").unwrap();
|
let index = index_scheduler.index("doggos").unwrap();
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// Ensure the document has been inserted into the relevant bitmap
|
||||||
|
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||||
|
// for consistency with the below
|
||||||
|
#[allow(clippy::get_first)]
|
||||||
|
let (name, _config, user_defined) = configs.get(0).unwrap();
|
||||||
|
insta::assert_snapshot!(name, @"A_fakerest");
|
||||||
|
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
|
||||||
|
|
||||||
|
let (name, _config, user_defined) = configs.get(1).unwrap();
|
||||||
|
insta::assert_snapshot!(name, @"B_small_hf");
|
||||||
|
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
|
||||||
|
|
||||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||||
|
|
||||||
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
|
assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true");
|
||||||
@ -5153,6 +5172,18 @@ mod tests {
|
|||||||
let index = index_scheduler.index("doggos").unwrap();
|
let index = index_scheduler.index("doggos").unwrap();
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// Ensure the document has been inserted into the relevant bitmap
|
||||||
|
let configs = index.embedding_configs(&rtxn).unwrap();
|
||||||
|
// for consistency with the below
|
||||||
|
#[allow(clippy::get_first)]
|
||||||
|
let (name, _config, user_defined) = configs.get(0).unwrap();
|
||||||
|
insta::assert_snapshot!(name, @"A_fakerest");
|
||||||
|
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>");
|
||||||
|
|
||||||
|
let (name, _config, user_defined) = configs.get(1).unwrap();
|
||||||
|
insta::assert_snapshot!(name, @"B_small_hf");
|
||||||
|
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[]>");
|
||||||
|
|
||||||
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
let embeddings = index.embeddings(&rtxn, 0).unwrap();
|
||||||
|
|
||||||
// automatically changed to patou
|
// automatically changed to patou
|
||||||
@ -5176,4 +5207,246 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn import_vectors_first_and_embedder_later() {
|
||||||
|
let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]);
|
||||||
|
|
||||||
|
let content = serde_json::json!(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"doggo": "kefir",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"doggo": "intel",
|
||||||
|
"_vectors": {
|
||||||
|
"my_doggo_embedder": vec![1; 384],
|
||||||
|
"unknown embedder": vec![1, 2, 3],
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"doggo": "max",
|
||||||
|
"_vectors": {
|
||||||
|
"my_doggo_embedder": {
|
||||||
|
"userProvided": true,
|
||||||
|
"embeddings": vec![2; 384],
|
||||||
|
},
|
||||||
|
"unknown embedder": vec![4, 5],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"doggo": "marcel",
|
||||||
|
"_vectors": {
|
||||||
|
"my_doggo_embedder": {
|
||||||
|
"userProvided": false,
|
||||||
|
"embeddings": vec![3; 384],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"doggo": "sora",
|
||||||
|
"_vectors": {
|
||||||
|
"my_doggo_embedder": {
|
||||||
|
"userProvided": false,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0 as u128).unwrap();
|
||||||
|
let documents_count =
|
||||||
|
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
||||||
|
.unwrap();
|
||||||
|
snapshot!(documents_count, @"5");
|
||||||
|
file.persist().unwrap();
|
||||||
|
|
||||||
|
index_scheduler
|
||||||
|
.register(
|
||||||
|
KindWithContent::DocumentAdditionOrUpdate {
|
||||||
|
index_uid: S("doggos"),
|
||||||
|
primary_key: None,
|
||||||
|
method: ReplaceDocuments,
|
||||||
|
content_file: uuid,
|
||||||
|
documents_count,
|
||||||
|
allow_index_creation: true,
|
||||||
|
},
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
handle.advance_one_successful_batch();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
|
||||||
|
let index = index_scheduler.index("doggos").unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||||
|
let documents = index
|
||||||
|
.all_documents(&rtxn)
|
||||||
|
.unwrap()
|
||||||
|
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push");
|
||||||
|
|
||||||
|
let mut setting = meilisearch_types::settings::Settings::<Unchecked>::default();
|
||||||
|
setting.embedders = Setting::Set(maplit::btreemap! {
|
||||||
|
S("my_doggo_embedder") => Setting::Set(EmbeddingSettings {
|
||||||
|
source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace),
|
||||||
|
model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")),
|
||||||
|
revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")),
|
||||||
|
document_template: Setting::Set(S("{{doc.doggo}}")),
|
||||||
|
.. EmbeddingSettings::default()
|
||||||
|
})
|
||||||
|
});
|
||||||
|
index_scheduler
|
||||||
|
.register(
|
||||||
|
KindWithContent::SettingsUpdate {
|
||||||
|
index_uid: S("doggos"),
|
||||||
|
new_settings: Box::new(setting),
|
||||||
|
is_deletion: false,
|
||||||
|
allow_index_creation: false,
|
||||||
|
},
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
handle.advance_one_successful_batch();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
|
||||||
|
let index = index_scheduler.index("doggos").unwrap();
|
||||||
|
let rtxn = index.read_txn().unwrap();
|
||||||
|
let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
|
||||||
|
let field_ids = field_ids_map.ids().collect::<Vec<_>>();
|
||||||
|
let documents = index
|
||||||
|
.all_documents(&rtxn)
|
||||||
|
.unwrap()
|
||||||
|
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
// all the vectors linked to the newly specified embedder have been removed
|
||||||
|
// Only the unknown embedders stay in the document DB
|
||||||
|
snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###);
|
||||||
|
let conf = index.embedding_configs(&rtxn).unwrap();
|
||||||
|
// even though we specified the vector for the ID 3, it shouldn't be marked
|
||||||
|
// as user provided since we explicitly marked it as NOT user provided.
|
||||||
|
snapshot!(format!("{conf:#?}"), @r###"
|
||||||
|
[
|
||||||
|
(
|
||||||
|
"my_doggo_embedder",
|
||||||
|
EmbeddingConfig {
|
||||||
|
embedder_options: HuggingFace(
|
||||||
|
EmbedderOptions {
|
||||||
|
model: "sentence-transformers/all-MiniLM-L6-v2",
|
||||||
|
revision: Some(
|
||||||
|
"e4ce9877abf3edfe10b0d82785e83bdcb973e22e",
|
||||||
|
),
|
||||||
|
distribution: None,
|
||||||
|
},
|
||||||
|
),
|
||||||
|
prompt: PromptData {
|
||||||
|
template: "{{doc.doggo}}",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
RoaringBitmap<[1, 2]>,
|
||||||
|
),
|
||||||
|
]
|
||||||
|
"###);
|
||||||
|
let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap();
|
||||||
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
|
let embedding = &embeddings["my_doggo_embedder"];
|
||||||
|
assert!(!embedding.is_empty(), "{embedding:?}");
|
||||||
|
|
||||||
|
// the document with the id 3 should keep its original embedding
|
||||||
|
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
||||||
|
let mut embeddings = Vec::new();
|
||||||
|
|
||||||
|
'vectors: for i in 0..=u8::MAX {
|
||||||
|
let reader = arroy::Reader::open(&rtxn, 0 | (i as u16), index.vector_arroy)
|
||||||
|
.map(Some)
|
||||||
|
.or_else(|e| match e {
|
||||||
|
arroy::Error::MissingMetadata => Ok(None),
|
||||||
|
e => Err(e),
|
||||||
|
})
|
||||||
|
.transpose();
|
||||||
|
|
||||||
|
let Some(reader) = reader else {
|
||||||
|
break 'vectors;
|
||||||
|
};
|
||||||
|
|
||||||
|
let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap();
|
||||||
|
if let Some(embedding) = embedding {
|
||||||
|
embeddings.push(embedding)
|
||||||
|
} else {
|
||||||
|
break 'vectors;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
snapshot!(embeddings.len(), @"1");
|
||||||
|
assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]);
|
||||||
|
|
||||||
|
// If we update marcel it should regenerate its embedding automatically
|
||||||
|
|
||||||
|
let content = serde_json::json!(
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"doggo": "marvel",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"doggo": "sorry",
|
||||||
|
},
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1 as u128).unwrap();
|
||||||
|
let documents_count =
|
||||||
|
read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file)
|
||||||
|
.unwrap();
|
||||||
|
snapshot!(documents_count, @"2");
|
||||||
|
file.persist().unwrap();
|
||||||
|
|
||||||
|
index_scheduler
|
||||||
|
.register(
|
||||||
|
KindWithContent::DocumentAdditionOrUpdate {
|
||||||
|
index_uid: S("doggos"),
|
||||||
|
primary_key: None,
|
||||||
|
method: UpdateDocuments,
|
||||||
|
content_file: uuid,
|
||||||
|
documents_count,
|
||||||
|
allow_index_creation: true,
|
||||||
|
},
|
||||||
|
None,
|
||||||
|
false,
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
handle.advance_one_successful_batch();
|
||||||
|
index_scheduler.assert_internally_consistent();
|
||||||
|
|
||||||
|
// the document with the id 3 should have its original embedding updated
|
||||||
|
let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap();
|
||||||
|
let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
|
let embedding = &embeddings["my_doggo_embedder"];
|
||||||
|
|
||||||
|
assert!(!embedding.is_empty());
|
||||||
|
// TODO: it shouldn't be equal to 3.0
|
||||||
|
assert!(embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]);
|
||||||
|
|
||||||
|
// the document with the id 4 should generate an embedding
|
||||||
|
// let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap();
|
||||||
|
// let embeddings = index.embeddings(&rtxn, docid).unwrap();
|
||||||
|
// dbg!(&embeddings);
|
||||||
|
// let embedding = &embeddings["my_doggo_embedder"];
|
||||||
|
|
||||||
|
// assert!(!embedding.is_empty());
|
||||||
|
// assert!(embedding[0]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
File diff suppressed because one or more lines are too long
@ -0,0 +1,4 @@
|
|||||||
|
---
|
||||||
|
source: index-scheduler/src/lib.rs
|
||||||
|
---
|
||||||
|
[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]
|
@ -672,7 +672,7 @@ pub fn settings(
|
|||||||
let embedders: BTreeMap<_, _> = index
|
let embedders: BTreeMap<_, _> = index
|
||||||
.embedding_configs(rtxn)?
|
.embedding_configs(rtxn)?
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(name, config)| (name, Setting::Set(config.into())))
|
.map(|(name, config, _)| (name, Setting::Set(config.into())))
|
||||||
.collect();
|
.collect();
|
||||||
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
|
let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) };
|
||||||
|
|
||||||
|
@ -44,7 +44,7 @@ once_cell = "1.19.0"
|
|||||||
ordered-float = "4.2.0"
|
ordered-float = "4.2.0"
|
||||||
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
rand_pcg = { version = "0.3.1", features = ["serde1"] }
|
||||||
rayon = "1.8.0"
|
rayon = "1.8.0"
|
||||||
roaring = "0.10.2"
|
roaring = { version = "0.10.2", features = ["serde"] }
|
||||||
rstar = { version = "0.11.0", features = ["serde"] }
|
rstar = { version = "0.11.0", features = ["serde"] }
|
||||||
serde = { version = "1.0.195", features = ["derive"] }
|
serde = { version = "1.0.195", features = ["derive"] }
|
||||||
serde_json = { version = "1.0.111", features = ["preserve_order"] }
|
serde_json = { version = "1.0.111", features = ["preserve_order"] }
|
||||||
|
@ -1572,16 +1572,18 @@ impl Index {
|
|||||||
Ok(script_language)
|
Ok(script_language)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Put the embedding configs:
|
||||||
|
/// 1. The name of the embedder
|
||||||
|
/// 2. The configuration option for this embedder
|
||||||
|
/// 3. The list of documents with a user provided embedding
|
||||||
pub(crate) fn put_embedding_configs(
|
pub(crate) fn put_embedding_configs(
|
||||||
&self,
|
&self,
|
||||||
wtxn: &mut RwTxn<'_>,
|
wtxn: &mut RwTxn<'_>,
|
||||||
configs: Vec<(String, EmbeddingConfig)>,
|
configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put(
|
self.main
|
||||||
wtxn,
|
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
|
||||||
main_key::EMBEDDING_CONFIGS,
|
.put(wtxn, main_key::EMBEDDING_CONFIGS, &configs)
|
||||||
&configs,
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
|
||||||
@ -1591,10 +1593,10 @@ impl Index {
|
|||||||
pub fn embedding_configs(
|
pub fn embedding_configs(
|
||||||
&self,
|
&self,
|
||||||
rtxn: &RoTxn<'_>,
|
rtxn: &RoTxn<'_>,
|
||||||
) -> Result<Vec<(String, crate::vector::EmbeddingConfig)>> {
|
) -> Result<Vec<(String, EmbeddingConfig, RoaringBitmap)>> {
|
||||||
Ok(self
|
Ok(self
|
||||||
.main
|
.main
|
||||||
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>()
|
.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig, RoaringBitmap)>>>()
|
||||||
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
|
.get(rtxn, main_key::EMBEDDING_CONFIGS)?
|
||||||
.unwrap_or_default())
|
.unwrap_or_default())
|
||||||
}
|
}
|
||||||
|
@ -10,16 +10,16 @@ use bytemuck::cast_slice;
|
|||||||
use grenad::Writer;
|
use grenad::Writer;
|
||||||
use itertools::EitherOrBoth;
|
use itertools::EitherOrBoth;
|
||||||
use ordered_float::OrderedFloat;
|
use ordered_float::OrderedFloat;
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
|
|
||||||
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
|
||||||
use crate::prompt::Prompt;
|
use crate::prompt::Prompt;
|
||||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
|
||||||
use crate::update::index_documents::helpers::try_split_at;
|
|
||||||
use crate::update::settings::InnerIndexSettingsDiff;
|
use crate::update::settings::InnerIndexSettingsDiff;
|
||||||
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
|
use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME};
|
||||||
use crate::vector::Embedder;
|
use crate::vector::Embedder;
|
||||||
use crate::{DocumentId, Result, ThreadPoolNoAbort};
|
use crate::{try_split_array_at, DocumentId, Result, ThreadPoolNoAbort};
|
||||||
|
|
||||||
/// The length of the elements that are always in the buffer when inserting new values.
|
/// The length of the elements that are always in the buffer when inserting new values.
|
||||||
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
const TRUNCATE_SIZE: usize = size_of::<DocumentId>();
|
||||||
@ -35,6 +35,8 @@ pub struct ExtractedVectorPoints {
|
|||||||
// embedder
|
// embedder
|
||||||
pub embedder_name: String,
|
pub embedder_name: String,
|
||||||
pub embedder: Arc<Embedder>,
|
pub embedder: Arc<Embedder>,
|
||||||
|
pub user_defined: RoaringBitmap,
|
||||||
|
pub remove_from_user_defined: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
enum VectorStateDelta {
|
enum VectorStateDelta {
|
||||||
@ -80,6 +82,11 @@ struct EmbedderVectorExtractor {
|
|||||||
prompts_writer: Writer<BufWriter<File>>,
|
prompts_writer: Writer<BufWriter<File>>,
|
||||||
// (docid) -> ()
|
// (docid) -> ()
|
||||||
remove_vectors_writer: Writer<BufWriter<File>>,
|
remove_vectors_writer: Writer<BufWriter<File>>,
|
||||||
|
|
||||||
|
// The docids of the documents that contain a user-defined embedding
|
||||||
|
user_defined: RoaringBitmap,
|
||||||
|
// The docids of the documents that contain an auto-generated embedding
|
||||||
|
remove_from_user_defined: RoaringBitmap,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
/// Extracts the embedding vector contained in each document under the `_vectors` field.
|
||||||
@ -134,6 +141,8 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
manual_vectors_writer,
|
manual_vectors_writer,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
|
user_defined: RoaringBitmap::new(),
|
||||||
|
remove_from_user_defined: RoaringBitmap::new(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -141,13 +150,15 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
let mut cursor = obkv_documents.into_cursor()?;
|
let mut cursor = obkv_documents.into_cursor()?;
|
||||||
while let Some((key, value)) = cursor.move_on_next()? {
|
while let Some((key, value)) = cursor.move_on_next()? {
|
||||||
// this must always be serialized as (docid, external_docid);
|
// this must always be serialized as (docid, external_docid);
|
||||||
|
const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::<DocumentId>();
|
||||||
let (docid_bytes, external_id_bytes) =
|
let (docid_bytes, external_id_bytes) =
|
||||||
try_split_at(key, std::mem::size_of::<DocumentId>()).unwrap();
|
try_split_array_at::<u8, SIZE_OF_DOCUMENTID>(key).unwrap();
|
||||||
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
debug_assert!(from_utf8(external_id_bytes).is_ok());
|
||||||
|
let docid = DocumentId::from_be_bytes(docid_bytes);
|
||||||
|
|
||||||
let obkv = obkv::KvReader::new(value);
|
let obkv = obkv::KvReader::new(value);
|
||||||
key_buffer.clear();
|
key_buffer.clear();
|
||||||
key_buffer.extend_from_slice(docid_bytes);
|
key_buffer.extend_from_slice(docid_bytes.as_slice());
|
||||||
|
|
||||||
// since we only need the primary key when we throw an error we create this getter to
|
// since we only need the primary key when we throw an error we create this getter to
|
||||||
// lazily get it when needed
|
// lazily get it when needed
|
||||||
@ -163,10 +174,22 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
manual_vectors_writer,
|
manual_vectors_writer,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined,
|
||||||
} in extractors.iter_mut()
|
} in extractors.iter_mut()
|
||||||
{
|
{
|
||||||
let delta = match parsed_vectors.remove(embedder_name) {
|
let delta = match parsed_vectors.remove(embedder_name) {
|
||||||
(Some(old), Some(new)) => {
|
(Some(old), Some(new)) => {
|
||||||
|
match (old.is_user_provided(), new.is_user_provided()) {
|
||||||
|
(true, true) | (false, false) => (),
|
||||||
|
(true, false) => {
|
||||||
|
remove_from_user_defined.insert(docid);
|
||||||
|
}
|
||||||
|
(false, true) => {
|
||||||
|
user_defined.insert(docid);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// no autogeneration
|
// no autogeneration
|
||||||
let del_vectors = old.into_array_of_vectors();
|
let del_vectors = old.into_array_of_vectors();
|
||||||
let add_vectors = new.into_array_of_vectors();
|
let add_vectors = new.into_array_of_vectors();
|
||||||
@ -187,6 +210,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
.map(|(_, deladd)| KvReaderDelAdd::new(deladd))
|
||||||
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
.any(|deladd| deladd.get(DelAdd::Addition).is_some());
|
||||||
if document_is_kept {
|
if document_is_kept {
|
||||||
|
remove_from_user_defined.insert(docid);
|
||||||
// becomes autogenerated
|
// becomes autogenerated
|
||||||
VectorStateDelta::NowGenerated(prompt.render(
|
VectorStateDelta::NowGenerated(prompt.render(
|
||||||
obkv,
|
obkv,
|
||||||
@ -198,6 +222,11 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
(None, Some(new)) => {
|
(None, Some(new)) => {
|
||||||
|
if new.is_user_provided() {
|
||||||
|
user_defined.insert(docid);
|
||||||
|
} else {
|
||||||
|
remove_from_user_defined.insert(docid);
|
||||||
|
}
|
||||||
// was possibly autogenerated, remove all vectors for that document
|
// was possibly autogenerated, remove all vectors for that document
|
||||||
let add_vectors = new.into_array_of_vectors();
|
let add_vectors = new.into_array_of_vectors();
|
||||||
if add_vectors.len() > usize::from(u8::MAX) {
|
if add_vectors.len() > usize::from(u8::MAX) {
|
||||||
@ -239,6 +268,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
VectorStateDelta::NoChange
|
VectorStateDelta::NoChange
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
remove_from_user_defined.remove(docid);
|
||||||
VectorStateDelta::NowRemoved
|
VectorStateDelta::NowRemoved
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -265,18 +295,18 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
|
|||||||
manual_vectors_writer,
|
manual_vectors_writer,
|
||||||
prompts_writer,
|
prompts_writer,
|
||||||
remove_vectors_writer,
|
remove_vectors_writer,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined,
|
||||||
} in extractors
|
} in extractors
|
||||||
{
|
{
|
||||||
results.push(ExtractedVectorPoints {
|
results.push(ExtractedVectorPoints {
|
||||||
// docid, _index -> KvWriterDelAdd -> Vector
|
|
||||||
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
manual_vectors: writer_into_reader(manual_vectors_writer)?,
|
||||||
// docid -> ()
|
|
||||||
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
remove_vectors: writer_into_reader(remove_vectors_writer)?,
|
||||||
// docid -> prompt
|
|
||||||
prompts: writer_into_reader(prompts_writer)?,
|
prompts: writer_into_reader(prompts_writer)?,
|
||||||
|
|
||||||
embedder,
|
embedder,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -238,6 +238,8 @@ fn send_original_documents_data(
|
|||||||
prompts,
|
prompts,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
embedder,
|
embedder,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined: auto_generated,
|
||||||
} in extracted_vectors
|
} in extracted_vectors
|
||||||
{
|
{
|
||||||
let embeddings = match extract_embeddings(
|
let embeddings = match extract_embeddings(
|
||||||
@ -262,6 +264,8 @@ fn send_original_documents_data(
|
|||||||
expected_dimension: embedder.dimensions(),
|
expected_dimension: embedder.dimensions(),
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined: auto_generated,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -501,6 +501,8 @@ where
|
|||||||
embeddings,
|
embeddings,
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined,
|
||||||
} => {
|
} => {
|
||||||
dimension.insert(embedder_name.clone(), expected_dimension);
|
dimension.insert(embedder_name.clone(), expected_dimension);
|
||||||
TypedChunk::VectorPoints {
|
TypedChunk::VectorPoints {
|
||||||
@ -509,6 +511,8 @@ where
|
|||||||
expected_dimension,
|
expected_dimension,
|
||||||
manual_vectors,
|
manual_vectors,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
user_defined,
|
||||||
|
remove_from_user_defined,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
otherwise => otherwise,
|
otherwise => otherwise,
|
||||||
@ -2616,10 +2620,11 @@ mod tests {
|
|||||||
|
|
||||||
let rtxn = index.read_txn().unwrap();
|
let rtxn = index.read_txn().unwrap();
|
||||||
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
|
let mut embedding_configs = index.embedding_configs(&rtxn).unwrap();
|
||||||
let (embedder_name, embedder) = embedding_configs.pop().unwrap();
|
let (embedder_name, embedder, user_defined) = embedding_configs.pop().unwrap();
|
||||||
|
insta::assert_snapshot!(embedder_name, @"manual");
|
||||||
|
insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0, 1, 2]>");
|
||||||
let embedder =
|
let embedder =
|
||||||
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
|
std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap());
|
||||||
assert_eq!("manual", embedder_name);
|
|
||||||
let res = index
|
let res = index
|
||||||
.search(&rtxn)
|
.search(&rtxn)
|
||||||
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))
|
.semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec()))
|
||||||
|
@ -90,6 +90,8 @@ pub(crate) enum TypedChunk {
|
|||||||
expected_dimension: usize,
|
expected_dimension: usize,
|
||||||
manual_vectors: grenad::Reader<BufReader<File>>,
|
manual_vectors: grenad::Reader<BufReader<File>>,
|
||||||
embedder_name: String,
|
embedder_name: String,
|
||||||
|
user_defined: RoaringBitmap,
|
||||||
|
remove_from_user_defined: RoaringBitmap,
|
||||||
},
|
},
|
||||||
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>),
|
||||||
}
|
}
|
||||||
@ -155,7 +157,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
let mut iter = merger.into_stream_merger_iter()?;
|
let mut iter = merger.into_stream_merger_iter()?;
|
||||||
|
|
||||||
let embedders: BTreeSet<_> =
|
let embedders: BTreeSet<_> =
|
||||||
index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect();
|
index.embedding_configs(wtxn)?.into_iter().map(|(name, _, _)| name).collect();
|
||||||
let mut vectors_buffer = Vec::new();
|
let mut vectors_buffer = Vec::new();
|
||||||
while let Some((key, reader)) = iter.next()? {
|
while let Some((key, reader)) = iter.next()? {
|
||||||
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
|
||||||
@ -181,7 +183,7 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
// if the `_vectors` field cannot be parsed as map of vectors, just write it as-is
|
||||||
break 'vectors Some(addition);
|
break 'vectors Some(addition);
|
||||||
};
|
};
|
||||||
vectors.retain_user_provided_vectors(&embedders);
|
vectors.retain_not_embedded_vectors(&embedders);
|
||||||
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors;
|
||||||
if vectors.is_empty() {
|
if vectors.is_empty() {
|
||||||
// skip writing empty `_vectors` map
|
// skip writing empty `_vectors` map
|
||||||
@ -619,6 +621,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn);
|
||||||
|
let mut user_defined = RoaringBitmap::new();
|
||||||
|
let mut remove_from_user_defined = RoaringBitmap::new();
|
||||||
let mut params = None;
|
let mut params = None;
|
||||||
for typed_chunk in typed_chunks {
|
for typed_chunk in typed_chunks {
|
||||||
let TypedChunk::VectorPoints {
|
let TypedChunk::VectorPoints {
|
||||||
@ -627,6 +631,8 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
embeddings,
|
embeddings,
|
||||||
expected_dimension,
|
expected_dimension,
|
||||||
embedder_name,
|
embedder_name,
|
||||||
|
user_defined: ud,
|
||||||
|
remove_from_user_defined: rud,
|
||||||
} = typed_chunk
|
} = typed_chunk
|
||||||
else {
|
else {
|
||||||
unreachable!();
|
unreachable!();
|
||||||
@ -639,11 +645,21 @@ pub(crate) fn write_typed_chunk_into_index(
|
|||||||
if let Some(embeddings) = embeddings {
|
if let Some(embeddings) = embeddings {
|
||||||
embeddings_builder.push(embeddings.into_cursor()?);
|
embeddings_builder.push(embeddings.into_cursor()?);
|
||||||
}
|
}
|
||||||
|
user_defined |= ud;
|
||||||
|
remove_from_user_defined |= rud;
|
||||||
}
|
}
|
||||||
|
|
||||||
// typed chunks has always at least 1 chunk.
|
// typed chunks has always at least 1 chunk.
|
||||||
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
let Some((expected_dimension, embedder_name)) = params else { unreachable!() };
|
||||||
|
|
||||||
|
let mut embedding_configs = index.embedding_configs(&wtxn)?;
|
||||||
|
let (_name, _conf, ud) =
|
||||||
|
embedding_configs.iter_mut().find(|config| config.0 == embedder_name).unwrap();
|
||||||
|
*ud -= remove_from_user_defined;
|
||||||
|
*ud |= user_defined;
|
||||||
|
|
||||||
|
index.put_embedding_configs(wtxn, embedding_configs)?;
|
||||||
|
|
||||||
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or(
|
||||||
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None },
|
||||||
)?;
|
)?;
|
||||||
|
@ -6,6 +6,7 @@ use std::sync::Arc;
|
|||||||
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
use charabia::{Normalize, Tokenizer, TokenizerBuilder};
|
||||||
use deserr::{DeserializeError, Deserr};
|
use deserr::{DeserializeError, Deserr};
|
||||||
use itertools::{EitherOrBoth, Itertools};
|
use itertools::{EitherOrBoth, Itertools};
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
use serde::{Deserialize, Deserializer, Serialize, Serializer};
|
||||||
use time::OffsetDateTime;
|
use time::OffsetDateTime;
|
||||||
|
|
||||||
@ -926,8 +927,13 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
Setting::Set(configs) => {
|
Setting::Set(configs) => {
|
||||||
let mut changed = false;
|
let mut changed = false;
|
||||||
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
let old_configs = self.index.embedding_configs(self.wtxn)?;
|
||||||
let old_configs: BTreeMap<String, Setting<EmbeddingSettings>> =
|
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> =
|
||||||
old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect();
|
old_configs
|
||||||
|
.into_iter()
|
||||||
|
.map(|(name, setting, user_defined)| {
|
||||||
|
(name, (Setting::Set(setting.into()), user_defined))
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
let mut new_configs = BTreeMap::new();
|
let mut new_configs = BTreeMap::new();
|
||||||
for joined in old_configs
|
for joined in old_configs
|
||||||
@ -936,15 +942,19 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
{
|
{
|
||||||
match joined {
|
match joined {
|
||||||
// updated config
|
// updated config
|
||||||
EitherOrBoth::Both((name, mut old), (_, new)) => {
|
EitherOrBoth::Both((name, (mut old, user_defined)), (_, new)) => {
|
||||||
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
|
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new);
|
||||||
if changed {
|
if changed {
|
||||||
tracing::debug!(embedder = name, "need reindex");
|
tracing::debug!(
|
||||||
|
embedder = name,
|
||||||
|
documents = user_defined.len(),
|
||||||
|
"need reindex"
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
tracing::debug!(embedder = name, "skip reindex");
|
tracing::debug!(embedder = name, "skip reindex");
|
||||||
}
|
}
|
||||||
let new = validate_embedding_settings(old, &name)?;
|
let new = validate_embedding_settings(old, &name)?;
|
||||||
new_configs.insert(name, new);
|
new_configs.insert(name, (new, user_defined));
|
||||||
}
|
}
|
||||||
// unchanged config
|
// unchanged config
|
||||||
EitherOrBoth::Left((name, setting)) => {
|
EitherOrBoth::Left((name, setting)) => {
|
||||||
@ -961,21 +971,23 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
|
|||||||
);
|
);
|
||||||
let setting = validate_embedding_settings(setting, &name)?;
|
let setting = validate_embedding_settings(setting, &name)?;
|
||||||
changed = true;
|
changed = true;
|
||||||
new_configs.insert(name, setting);
|
new_configs.insert(name, (setting, RoaringBitmap::new()));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
let new_configs: Vec<(String, EmbeddingConfig)> = new_configs
|
let new_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)> = new_configs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.filter_map(|(name, setting)| match setting {
|
.filter_map(|(name, (setting, user_defined))| match setting {
|
||||||
Setting::Set(value) => Some((name, value.into())),
|
Setting::Set(settings) => Some((name, settings.into(), user_defined)),
|
||||||
Setting::Reset => None,
|
Setting::Reset => None,
|
||||||
Setting::NotSet => Some((name, EmbeddingSettings::default().into())),
|
Setting::NotSet => {
|
||||||
|
Some((name, EmbeddingSettings::default().into(), user_defined))
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
self.index.embedder_category_id.clear(self.wtxn)?;
|
self.index.embedder_category_id.clear(self.wtxn)?;
|
||||||
for (index, (embedder_name, _)) in new_configs.iter().enumerate() {
|
for (index, (embedder_name, _, _)) in new_configs.iter().enumerate() {
|
||||||
self.index.embedder_category_id.put_with_flags(
|
self.index.embedder_category_id.put_with_flags(
|
||||||
self.wtxn,
|
self.wtxn,
|
||||||
heed::PutFlags::APPEND,
|
heed::PutFlags::APPEND,
|
||||||
@ -1359,10 +1371,12 @@ impl InnerIndexSettings {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result<EmbeddingConfigs> {
|
fn embedders(
|
||||||
|
embedding_configs: Vec<(String, EmbeddingConfig, RoaringBitmap)>,
|
||||||
|
) -> Result<EmbeddingConfigs> {
|
||||||
let res: Result<_> = embedding_configs
|
let res: Result<_> = embedding_configs
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.map(|(name, EmbeddingConfig { embedder_options, prompt })| {
|
.map(|(name, EmbeddingConfig { embedder_options, prompt }, _)| {
|
||||||
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?);
|
||||||
|
|
||||||
let embedder = Arc::new(
|
let embedder = Arc::new(
|
||||||
|
@ -17,6 +17,13 @@ pub enum Vectors {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Vectors {
|
impl Vectors {
|
||||||
|
pub fn is_user_provided(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
Vectors::ImplicitlyUserProvided(_) => true,
|
||||||
|
Vectors::Explicit(ExplicitVectors { user_provided, .. }) => *user_provided,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
|
pub fn into_array_of_vectors(self) -> Vec<Embedding> {
|
||||||
match self {
|
match self {
|
||||||
Vectors::ImplicitlyUserProvided(embeddings)
|
Vectors::ImplicitlyUserProvided(embeddings)
|
||||||
@ -89,15 +96,8 @@ impl ParsedVectors {
|
|||||||
Ok(ParsedVectors(value))
|
Ok(ParsedVectors(value))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet<String>) {
|
pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet<String>) {
|
||||||
self.0.retain(|k, v| match v {
|
self.0.retain(|k, _v| !embedders.contains(k))
|
||||||
Vectors::ImplicitlyUserProvided(_) => true,
|
|
||||||
Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => {
|
|
||||||
*user_provided
|
|
||||||
// if the embedder is not in the config, then never touch it
|
|
||||||
|| !embedders.contains(k)
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user