diff --git a/Cargo.lock b/Cargo.lock index 4417af63a..2a5960502 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2455,6 +2455,7 @@ name = "index-scheduler" version = "1.9.0" dependencies = [ "anyhow", + "arroy", "big_s", "bincode", "crossbeam", @@ -2465,6 +2466,7 @@ dependencies = [ "file-store", "flate2", "insta", + "maplit", "meili-snap", "meilisearch-auth", "meilisearch-types", @@ -5301,9 +5303,9 @@ dependencies = [ [[package]] name = "tracing-actix-web" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa069bd1503dd526ee793bb3fce408895136c95fc86d2edb2acf1c646d7f0684" +checksum = "4ee9e39a66d9b615644893ffc1704d2a89b5b315b7fd0228ad3182ca9a306b19" dependencies = [ "actix-web", "mutually_exclusive_features", diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap index 0aad0ea97..a9c76227a 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-6.snap @@ -780,7 +780,7 @@ expression: document 1.3484878540039063 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap index f2a5e1d69..e5d28e450 100644 --- a/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap +++ b/dump/src/reader/snapshots/dump__reader__test__import_dump_v6_with_vectors-7.snap @@ -779,7 +779,7 @@ expression: document 1.04031240940094 ] ], - "userProvided": false + "regenerate": true } } } diff --git a/dump/src/reader/v3/settings.rs b/dump/src/reader/v3/settings.rs index 0027bf4ff..3288bb1e7 100644 --- a/dump/src/reader/v3/settings.rs +++ b/dump/src/reader/v3/settings.rs @@ -152,6 +152,7 @@ impl Settings { } #[derive(Debug, Clone, Deserialize)] +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] #[serde(rename_all = "camelCase")] diff --git a/dump/src/reader/v4/settings.rs b/dump/src/reader/v4/settings.rs index 964cd1152..78d9118ff 100644 --- a/dump/src/reader/v4/settings.rs +++ b/dump/src/reader/v4/settings.rs @@ -182,6 +182,7 @@ impl Settings { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug, Clone, Deserialize)] #[cfg_attr(test, derive(serde::Serialize))] #[serde(deny_unknown_fields)] diff --git a/dump/src/reader/v5/tasks.rs b/dump/src/reader/v5/tasks.rs index 528a870fc..8dfb2d0b0 100644 --- a/dump/src/reader/v5/tasks.rs +++ b/dump/src/reader/v5/tasks.rs @@ -200,6 +200,7 @@ impl std::ops::Deref for IndexUid { } } +#[allow(dead_code)] // otherwise rustc complains that the fields go unused #[derive(Debug)] #[cfg_attr(test, derive(serde::Serialize))] #[cfg_attr(test, serde(rename_all = "camelCase"))] diff --git a/dump/tests/assets/v6-with-vectors.dump b/dump/tests/assets/v6-with-vectors.dump index 9f8ed2ba1..8c0505772 100644 Binary files a/dump/tests/assets/v6-with-vectors.dump and b/dump/tests/assets/v6-with-vectors.dump differ diff --git a/index-scheduler/Cargo.toml b/index-scheduler/Cargo.toml index 21fa34733..8959bb070 100644 --- a/index-scheduler/Cargo.toml +++ b/index-scheduler/Cargo.toml @@ -40,7 +40,9 @@ ureq = "2.9.7" uuid = { version = "1.6.1", features = ["serde", "v4"] } 
[dev-dependencies] +arroy = "0.3.1" big_s = "1.0.2" crossbeam = "0.8.4" insta = { version = "1.34.0", features = ["json", "redactions"] } +maplit = "1.0.2" meili-snap = { path = "../meili-snap" } diff --git a/index-scheduler/src/batch.rs b/index-scheduler/src/batch.rs index 181ac49a3..cd5525eea 100644 --- a/index-scheduler/src/batch.rs +++ b/index-scheduler/src/batch.rs @@ -909,6 +909,7 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); + let embedding_configs = index.embedding_configs(&rtxn)?; // 3.1. Dump the documents for ret in index.all_documents(&rtxn)? { @@ -951,16 +952,21 @@ impl IndexScheduler { }; for (embedder_name, embeddings) in embeddings { - // don't change the entry if it already exists, because it was user-provided - vectors.entry(embedder_name).or_insert_with(|| { - let embeddings = ExplicitVectors { - embeddings: VectorOrArrayOfVectors::from_array_of_vectors( - embeddings, - ), - user_provided: false, - }; - serde_json::to_value(embeddings).unwrap() - }); + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == embedder_name) + .is_some_and(|conf| conf.user_provided.contains(id)); + + let embeddings = ExplicitVectors { + embeddings: Some( + VectorOrArrayOfVectors::from_array_of_vectors(embeddings), + ), + regenerate: !user_provided, + }; + vectors.insert( + embedder_name, + serde_json::to_value(embeddings).unwrap(), + ); } } diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 8a1c2f540..88997b715 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -53,6 +53,7 @@ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{SerdeBincode, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, PutFlags, RoTxn, RwTxn}; use meilisearch_types::milli::documents::DocumentsBatchBuilder; +use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; use meilisearch_types::milli::{self, CboRoaringBitmapCodec, Index, RoaringBitmapCodec, BEU32}; @@ -1459,33 +1460,39 @@ impl IndexScheduler { // TODO: consider using a type alias or a struct embedder/template pub fn embedders( &self, - embedding_configs: Vec<(String, milli::vector::EmbeddingConfig)>, + embedding_configs: Vec<IndexEmbeddingConfig>, ) -> Result<EmbeddingConfigs> { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, milli::vector::EmbeddingConfig { embedder_options, prompt })| { - let prompt = - Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); - // optimistically return existing embedder - { - let embedders = self.embedders.read().unwrap(); - if let Some(embedder) = embedders.get(&embedder_options) { - return Ok((name, (embedder.clone(), prompt))); + .map( + |IndexEmbeddingConfig { + name, + config: milli::vector::EmbeddingConfig { embedder_options, prompt }, + ..
+ }| { + let prompt = + Arc::new(prompt.try_into().map_err(meilisearch_types::milli::Error::from)?); + // optimistically return existing embedder + { + let embedders = self.embedders.read().unwrap(); + if let Some(embedder) = embedders.get(&embedder_options) { + return Ok((name, (embedder.clone(), prompt))); + } } - } - // add missing embedder - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(meilisearch_types::milli::vector::Error::from) - .map_err(meilisearch_types::milli::Error::from)?, - ); - { - let mut embedders = self.embedders.write().unwrap(); - embedders.insert(embedder_options, embedder.clone()); - } - Ok((name, (embedder, prompt))) - }) + // add missing embedder + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(meilisearch_types::milli::vector::Error::from) + .map_err(meilisearch_types::milli::Error::from)?, + ); + { + let mut embedders = self.embedders.write().unwrap(); + embedders.insert(embedder_options, embedder.clone()); + } + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } @@ -1748,6 +1755,9 @@ mod tests { use meilisearch_types::milli::update::IndexDocumentsMethod::{ ReplaceDocuments, UpdateDocuments, }; + use meilisearch_types::milli::update::Setting; + use meilisearch_types::milli::vector::settings::EmbeddingSettings; + use meilisearch_types::settings::Unchecked; use meilisearch_types::tasks::IndexSwap; use meilisearch_types::VERSION_FILE_NAME; use tempfile::{NamedTempFile, TempDir}; @@ -1826,6 +1836,7 @@ mod tests { assert_eq!(breakpoint, (Init, false)); let index_scheduler_handle = IndexSchedulerHandle { _tempdir: tempdir, + index_scheduler: index_scheduler.private_clone(), test_breakpoint_rcv: receiver, last_breakpoint: breakpoint.0, }; @@ -1914,6 +1925,7 @@ mod tests { pub struct IndexSchedulerHandle { _tempdir: TempDir, + index_scheduler: IndexScheduler, test_breakpoint_rcv: crossbeam::channel::Receiver<(Breakpoint, bool)>, last_breakpoint: Breakpoint, } @@ -1931,9 +1943,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; // if we've already encountered a breakpoint we're supposed to be stuck on the false // and we expect the same variant with the true to come now. @@ -1952,9 +1968,13 @@ mod tests { { Ok(b) => b, Err(RecvTimeoutError::Timeout) => { - panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.") + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler seems to be waiting for a new task while your test is waiting for a breakpoint.\n{state}") + } + Err(RecvTimeoutError::Disconnected) => { + let state = snapshot_index_scheduler(&self.index_scheduler); + panic!("The scheduler crashed.\n{state}") } - Err(RecvTimeoutError::Disconnected) => panic!("The scheduler crashed."), }; assert!(!b, "Found the breakpoint handle in a bad state. 
Check your test suite"); @@ -1968,9 +1988,10 @@ mod tests { fn advance_till(&mut self, breakpoints: impl IntoIterator) { for breakpoint in breakpoints { let b = self.advance(); + let state = snapshot_index_scheduler(&self.index_scheduler); assert_eq!( b, breakpoint, - "Was expecting the breakpoint `{:?}` but instead got `{:?}`.", + "Was expecting the breakpoint `{:?}` but instead got `{:?}`.\n{state}", breakpoint, b ); } @@ -1995,6 +2016,7 @@ mod tests { // Wait for one successful batch. #[track_caller] fn advance_one_successful_batch(&mut self) { + self.index_scheduler.assert_internally_consistent(); self.advance_till([Start, BatchCreated]); loop { match self.advance() { @@ -2003,13 +2025,17 @@ mod tests { InsideProcessBatch => (), // the batch went successfully, we can stop the loop and go on with the next states. ProcessBatchSucceeded => break, - AbortedIndexation => panic!("The batch was aborted."), - ProcessBatchFailed => panic!("The batch failed."), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), + ProcessBatchFailed => { + while self.advance() != Start {} + panic!("The batch failed.\n{}", snapshot_index_scheduler(&self.index_scheduler)) + }, breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } self.advance_till([AfterProcessing]); + self.index_scheduler.assert_internally_consistent(); } // Wait for one failed batch. @@ -2023,8 +2049,8 @@ mod tests { InsideProcessBatch => (), // the batch went failed, we can stop the loop and go on with the next states. ProcessBatchFailed => break, - ProcessBatchSucceeded => panic!("The batch succeeded. (and it wasn't supposed to sorry)"), - AbortedIndexation => panic!("The batch was aborted."), + ProcessBatchSucceeded => panic!("The batch succeeded. 
(and it wasn't supposed to sorry)\n{}", snapshot_index_scheduler(&self.index_scheduler)), + AbortedIndexation => panic!("The batch was aborted.\n{}", snapshot_index_scheduler(&self.index_scheduler)), breakpoint => panic!("Encountered an impossible breakpoint `{:?}`, this is probably an issue with the test suite.", breakpoint), } } @@ -3052,8 +3078,10 @@ mod tests { let rtxn = index.read_txn().unwrap(); let configs = index.embedding_configs(&rtxn).unwrap(); - let (_, embedding_config) = configs.first().unwrap(); - insta::assert_json_snapshot!(embedding_config.embedder_options); + let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + insta::assert_snapshot!(name, @"default"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_json_snapshot!(config.embedder_options); } #[test] @@ -4989,7 +5017,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "after_registering_settings_task_vectors"); @@ -5000,7 +5027,7 @@ mod tests { insta::assert_json_snapshot!(task.details); } - handle.advance_n_successful_batches(1); + handle.advance_one_successful_batch(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "settings_update_processed_vectors"); { @@ -5017,13 +5044,17 @@ mod tests { let configs = index.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let (name, fakerest_config) = configs.get(0).unwrap(); - insta::assert_json_snapshot!(name, @r###""A_fakerest""###); + let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let (name, simple_hf_config) = configs.get(1).unwrap(); - insta::assert_json_snapshot!(name, @r###""B_small_hf""###); + let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); @@ -5038,25 +5069,25 @@ mod tests { // add one doc, specifying vectors let doc = serde_json::json!( - { - "id": 0, - "doggo": "Intel", - "breed": "beagle", - "_vectors": { - &fakerest_name: { - // this will never trigger regeneration, which is good because we can't actually generate with - // this embedder - "userProvided": true, - "embeddings": beagle_embed, - }, - &simple_hf_name: { - // this will be regenerated on updates - "userProvided": false, - "embeddings": lab_embed, - }, - "noise": [0.1, 0.2, 0.3] - } - } + { + "id": 0, + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + &fakerest_name: { + // this will never trigger regeneration, which is good because we can't actually generate with + // this embedder + "regenerate": false, + "embeddings": beagle_embed, + }, + &simple_hf_name: { + // this will be regenerated on updates + "regenerate": true, + "embeddings": lab_embed, + }, + "noise": [0.1, 0.2, 0.3] + } + } ); let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0u128).unwrap(); @@ -5078,7 +5109,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: 
"after adding Intel"); @@ -5091,6 +5121,19 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); @@ -5140,7 +5183,6 @@ mod tests { false, ) .unwrap(); - index_scheduler.assert_internally_consistent(); snapshot!(snapshot_index_scheduler(&index_scheduler), name: "Intel to kefir"); @@ -5153,11 +5195,25 @@ mod tests { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + // Ensure the document have been inserted into the relevant bitamp + let configs = index.embedding_configs(&rtxn).unwrap(); + // for consistency with the below + #[allow(clippy::get_first)] + let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = + configs.get(0).unwrap(); + insta::assert_snapshot!(name, @"A_fakerest"); + insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + + let IndexEmbeddingConfig { name, config: _, user_provided } = + configs.get(1).unwrap(); + insta::assert_snapshot!(name, @"B_small_hf"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + let embeddings = index.embeddings(&rtxn, 0).unwrap(); - // automatically changed to patou + // automatically changed to patou because set to regenerate assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); - // remained beagle because set to userProvided + // remained beagle assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; @@ -5176,4 +5232,578 @@ mod tests { } } } + + #[test] + fn import_vectors_first_and_embedder_later() { + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "my_doggo_embedder": vec![1; 384], + "unknown embedder": vec![1, 2, 3], + } + }, + { + "id": 2, + "doggo": "max", + "_vectors": { + "my_doggo_embedder": { + "regenerate": false, + "embeddings": vec![2; 384], + }, + "unknown embedder": vec![4, 5], + }, + }, + { + "id": 3, + "doggo": "marcel", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + "embeddings": vec![3; 384], + }, + }, + }, + { + "id": 4, + "doggo": "sora", + "_vectors": { + "my_doggo_embedder": { + "regenerate": true, + }, + }, + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"5"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, 
+ content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), name: "documents after initial push"); + + let setting = meilisearch_types::settings::Settings::<Unchecked> { + embedders: Setting::Set(maplit::btreemap! { + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + index_scheduler.assert_internally_consistent(); + handle.advance_one_successful_batch(); + index_scheduler.assert_internally_consistent(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + // all the vectors linked to the newly specified embedder have been removed; + // only the unknown embedders stay in the document DB + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + // even though we specified the vector for the ID 3, it shouldn't be marked + // as user provided since we explicitly marked it as NOT user provided.
+ snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "my_doggo_embedder", + config: EmbeddingConfig { + embedder_options: HuggingFace( + EmbedderOptions { + model: "sentence-transformers/all-MiniLM-L6-v2", + revision: Some( + "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + ), + distribution: None, + }, + ), + prompt: PromptData { + template: "{{doc.doggo}}", + }, + }, + user_provided: RoaringBitmap<[1, 2]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + // the document with the id 3 should keep its original embedding + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let mut embeddings = Vec::new(); + + 'vectors: for i in 0..=u8::MAX { + let reader = arroy::Reader::open(&rtxn, i as u16, index.vector_arroy) + .map(Some) + .or_else(|e| match e { + arroy::Error::MissingMetadata => Ok(None), + e => Err(e), + }) + .transpose(); + + let Some(reader) = reader else { + break 'vectors; + }; + + let embedding = reader.unwrap().item_vector(&rtxn, docid).unwrap(); + if let Some(embedding) = embedding { + embeddings.push(embedding) + } else { + break 'vectors; + } + } + + snapshot!(embeddings.len(), @"1"); + assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); + + // If we update marcel it should regenerate its embedding automatically + + let content = serde_json::json!( + [ + { + "id": 3, + "doggo": "marvel", + }, + { + "id": 4, + "doggo": "sorry", + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(1_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: UpdateDocuments, + content_file: uuid, + documents_count, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + // the document with the id 3 should have its original embedding updated + let rtxn = index.read_txn().unwrap(); + let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); + let doc = index.documents(&rtxn, Some(docid)).unwrap()[0]; + let doc = obkv_to_json(&field_ids, &field_ids_map, doc.1).unwrap(); + snapshot!(json_string!(doc), @r###" + { + "id": 3, + "doggo": "marvel" + } + "###); + + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); + + // the document with the id 4 should generate an embedding + let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["my_doggo_embedder"]; + + assert!(!embedding.is_empty()); + } + + #[test] + fn delete_document_containing_vector() { + // 1. Add an embedder + // 2. Push two documents containing a simple vector + // 3. Delete the first document + // 4. The user defined roaring bitmap shouldn't contains the id of the first document anymore + // 5. Clear the index + // 6. 
The user-defined roaring bitmap shouldn't contain the id of the second document + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings::<Unchecked> { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }) + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + index_scheduler + .register( + KindWithContent::DocumentDeletion { + index_uid: S("doggos"), + documents_ids: vec![S("1")], + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); + let conf = index.embedding_configs(&rtxn).unwrap(); + snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[0]>, + }, + ] + "###); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); + let embeddings = index.embeddings(&rtxn, docid).unwrap(); + let embedding = &embeddings["manual"]; + assert!(!embedding.is_empty(), "{embedding:?}"); + + index_scheduler + .register(KindWithContent::DocumentClear { index_uid: S("doggos") }, None, false) + .unwrap(); + handle.advance_one_successful_batch(); + + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::<Vec<_>>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::<Vec<_>>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); + let conf = index.embedding_configs(&rtxn).unwrap(); +
snapshot!(format!("{conf:#?}"), @r###" + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, + }, + ), + prompt: PromptData { + template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + }, + }, + user_provided: RoaringBitmap<[]>, + }, + ] + "###); + } + + #[test] + fn delete_embedder_with_user_provided_vectors() { + // 1. Add two embedders + // 2. Push two documents containing a simple vector + // 3. The documents must not contain the vectors after the update as they are in the vectors db + // 3. Delete the embedders + // 4. The documents contain the vectors again + let (index_scheduler, mut handle) = IndexScheduler::test(true, vec![]); + + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! { + S("manual") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::UserProvided), + dimensions: Setting::Set(3), + ..Default::default() + }), + S("my_doggo_embedder") => Setting::Set(EmbeddingSettings { + source: Setting::Set(milli::vector::settings::EmbedderSource::HuggingFace), + model: Setting::Set(S("sentence-transformers/all-MiniLM-L6-v2")), + revision: Setting::Set(S("e4ce9877abf3edfe10b0d82785e83bdcb973e22e")), + document_template: Setting::Set(S("{{doc.doggo}}")), + ..Default::default() + }), + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + let content = serde_json::json!( + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "manual": vec![0, 0, 0], + "my_doggo_embedder": vec![1; 384], + } + }, + { + "id": 1, + "doggo": "intel", + "_vectors": { + "manual": vec![1, 1, 1], + } + }, + ] + ); + + let (uuid, mut file) = index_scheduler.create_update_file_with_uuid(0_u128).unwrap(); + let documents_count = + read_json(serde_json::to_string_pretty(&content).unwrap().as_bytes(), &mut file) + .unwrap(); + snapshot!(documents_count, @"2"); + file.persist().unwrap(); + + index_scheduler + .register( + KindWithContent::DocumentAdditionOrUpdate { + index_uid: S("doggos"), + primary_key: None, + method: ReplaceDocuments, + content_file: uuid, + documents_count, + allow_index_creation: false, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel"}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Set(maplit::btreemap! 
{ + S("manual") => Setting::Reset, + }), + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir","_vectors":{"manual":{"embeddings":[[0.0,0.0,0.0]],"regenerate":false}}},{"id":1,"doggo":"intel","_vectors":{"manual":{"embeddings":[[1.0,1.0,1.0]],"regenerate":false}}}]"###); + } + + { + let setting = meilisearch_types::settings::Settings:: { + embedders: Setting::Reset, + ..Default::default() + }; + index_scheduler + .register( + KindWithContent::SettingsUpdate { + index_uid: S("doggos"), + new_settings: Box::new(setting), + is_deletion: false, + allow_index_creation: true, + }, + None, + false, + ) + .unwrap(); + handle.advance_one_successful_batch(); + } + + { + let index = index_scheduler.index("doggos").unwrap(); + let rtxn = index.read_txn().unwrap(); + let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); + let field_ids = field_ids_map.ids().collect::>(); + let documents = index + .all_documents(&rtxn) + .unwrap() + .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) + .collect::>(); + + // FIXME: redaction + snapshot!(json_string!(serde_json::to_string(&documents).unwrap(), { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), 
@r###""[{\"id\":0,\"doggo\":\"kefir\",\"_vectors\":{\"manual\":{\"embeddings\":[[0.0,0.0,0.0]],\"regenerate\":false},\"my_doggo_embedder\":{\"embeddings\":[[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0]],\"regenerate\":false}}},{\"id\":1,\"doggo\":\"intel\",\"_vectors\":{\"manual\":{\"embeddings\":[[1.0,1.0,1.0]],\"regenerate\":false}}}]""###); + } + } } diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap index 002a42e59..540835dfb 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-9.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-15.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "Intel", "breed": "beagle", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap similarity index 67% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap index 718ea229c..bc35d84f6 100644 --- a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-12.snap +++ b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-22.snap @@ -6,10 +6,6 @@ expression: doc "doggo": "kefir", "breed": "patou", "_vectors": { - "A_fakerest": { - "embeddings": "[vector]", - "userProvided": true - }, "noise": [ 0.1, 0.2, diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap similarity index 100% rename from 
index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-4.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-5.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap b/index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-6.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__import_vectors-8.snap diff --git a/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap b/index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap similarity index 100% rename from index-scheduler/src/snapshots/index_scheduler__tests__settings_update-3.snap rename to index-scheduler/src/snapshots/index_scheduler__tests__settings_update-5.snap diff --git a/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap new file mode 100644 index 000000000..d2473d00a --- /dev/null +++ b/index-scheduler/src/snapshots/lib.rs/import_vectors_first_and_embedder_later/documents after initial push.snap @@ -0,0 +1,4 @@ +--- +source: index-scheduler/src/lib.rs +--- +[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"my_doggo_embedder":[1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0],"unknown 
embedder":[1.0,2.0,3.0]}},{"id":2,"doggo":"max","_vectors":{"my_doggo_embedder":{"embeddings":[2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0],"regenerate":false},"unknown embedder":[4.0,5.0]}},{"id":3,"doggo":"marcel","_vectors":{"my_doggo_embedder":{"embeddings":[3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0,3.0],"regenerate":true}}},{"id":4,"doggo":"sora","_vectors":{"my_doggo_embedder":{"embeddings":null,"regenerate":true}}}] diff --git a/meilisearch-types/src/error.rs b/meilisearch-types/src/error.rs index 150c56b9d..ae2a753db 100644 --- 
a/meilisearch-types/src/error.rs +++ b/meilisearch-types/src/error.rs @@ -222,6 +222,7 @@ InvalidApiKeyUid , InvalidRequest , BAD_REQUEST ; InvalidContentType , InvalidRequest , UNSUPPORTED_MEDIA_TYPE ; InvalidDocumentCsvDelimiter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFields , InvalidRequest , BAD_REQUEST ; +InvalidDocumentRetrieveVectors , InvalidRequest , BAD_REQUEST ; MissingDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentFilter , InvalidRequest , BAD_REQUEST ; InvalidDocumentGeoField , InvalidRequest , BAD_REQUEST ; @@ -240,9 +241,11 @@ InvalidSearchAttributesToSearchOn , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToCrop , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToHighlight , InvalidRequest , BAD_REQUEST ; InvalidSimilarAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; +InvalidSimilarRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchAttributesToRetrieve , InvalidRequest , BAD_REQUEST ; InvalidSearchRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; InvalidSimilarRankingScoreThreshold , InvalidRequest , BAD_REQUEST ; +InvalidSearchRetrieveVectors , InvalidRequest , BAD_REQUEST ; InvalidSearchCropLength , InvalidRequest , BAD_REQUEST ; InvalidSearchCropMarker , InvalidRequest , BAD_REQUEST ; InvalidSearchFacets , InvalidRequest , BAD_REQUEST ; diff --git a/meilisearch-types/src/settings.rs b/meilisearch-types/src/settings.rs index 223d71658..8a9708d29 100644 --- a/meilisearch-types/src/settings.rs +++ b/meilisearch-types/src/settings.rs @@ -8,6 +8,7 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; +use milli::index::IndexEmbeddingConfig; use milli::proximity::ProximityPrecision; use milli::update::Setting; use milli::{Criterion, CriterionError, Index, DEFAULT_VALUES_PER_FACET}; @@ -672,7 +673,7 @@ pub fn settings( let embedders: BTreeMap<_, _> = index .embedding_configs(rtxn)? .into_iter() - .map(|(name, config)| (name, Setting::Set(config.into()))) + .map(|IndexEmbeddingConfig { name, config, .. 
}| (name, Setting::Set(config.into()))) .collect(); let embedders = if embedders.is_empty() { Setting::NotSet } else { Setting::Set(embedders) }; diff --git a/meilisearch/src/analytics/mod.rs b/meilisearch/src/analytics/mod.rs index 3468ad2c7..6863dc57b 100644 --- a/meilisearch/src/analytics/mod.rs +++ b/meilisearch/src/analytics/mod.rs @@ -74,8 +74,8 @@ pub enum DocumentDeletionKind { #[derive(Copy, Clone, Debug, PartialEq, Eq)] pub enum DocumentFetchKind { - PerDocumentId, - Normal { with_filter: bool, limit: usize, offset: usize }, + PerDocumentId { retrieve_vectors: bool }, + Normal { with_filter: bool, limit: usize, offset: usize, retrieve_vectors: bool }, } pub trait Analytics: Sync + Send { diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index aed29e612..56a781c47 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -622,6 +622,7 @@ pub struct SearchAggregator { // Whether a non-default embedder was specified embedder: bool, hybrid: bool, + retrieve_vectors: bool, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap<String, usize>, @@ -662,6 +663,7 @@ impl SearchAggregator { page, hits_per_page, attributes_to_retrieve: _, + retrieve_vectors, attributes_to_crop: _, crop_length, attributes_to_highlight: _, @@ -728,6 +730,7 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { let limit = hits_per_page.unwrap_or_else(DEFAULT_SEARCH_LIMIT); @@ -803,6 +806,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -873,6 +877,7 @@ impl SearchAggregator { // vector self.max_vector_size = self.max_vector_size.max(max_vector_size); + self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; self.embedder |= embedder; @@ -929,6 +934,7 @@ impl SearchAggregator { attributes_to_search_on_total_number_of_uses, max_terms_number, max_vector_size, + retrieve_vectors, matching_strategy, max_limit, max_offset, @@ -991,6 +997,7 @@ impl SearchAggregator { }, "vector": { "max_vector_size": max_vector_size, + "retrieve_vectors": retrieve_vectors, }, "hybrid": { "enabled": hybrid, @@ -1079,6 +1086,7 @@ impl MultiSearchAggregator { page: _, hits_per_page: _, attributes_to_retrieve: _, + retrieve_vectors: _, attributes_to_crop: _, crop_length: _, attributes_to_highlight: _, @@ -1534,6 +1542,9 @@ pub struct DocumentsFetchAggregator { // if a filter was used per_filter: bool, + #[serde(rename = "vector.retrieve_vectors")] + retrieve_vectors: bool, + // pagination #[serde(rename = "pagination.max_limit")] max_limit: usize, @@ -1543,18 +1554,21 @@ impl DocumentsFetchAggregator { pub fn from_query(query: &DocumentFetchKind, request: &HttpRequest) -> Self { - let (limit, offset) = match query { - DocumentFetchKind::PerDocumentId => (1, 0), - DocumentFetchKind::Normal { limit, offset, .. } => (*limit, *offset), + let (limit, offset, retrieve_vectors) = match query { + DocumentFetchKind::PerDocumentId { retrieve_vectors } => (1, 0, *retrieve_vectors), + DocumentFetchKind::Normal { limit, offset, retrieve_vectors, ..
} => { (*limit, *offset, *retrieve_vectors) } }; Self { timestamp: Some(OffsetDateTime::now_utc()), user_agents: extract_user_agents(request).into_iter().collect(), total_received: 1, - per_document_id: matches!(query, DocumentFetchKind::PerDocumentId), + per_document_id: matches!(query, DocumentFetchKind::PerDocumentId { .. }), per_filter: matches!(query, DocumentFetchKind::Normal { with_filter, .. } if *with_filter), max_limit: limit, max_offset: offset, + retrieve_vectors, } } @@ -1568,6 +1582,7 @@ impl DocumentsFetchAggregator { per_filter, max_limit, max_offset, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1583,6 +1598,8 @@ impl DocumentsFetchAggregator { self.max_limit = self.max_limit.max(max_limit); self.max_offset = self.max_offset.max(max_offset); + + self.retrieve_vectors |= retrieve_vectors; } pub fn into_event(self, user: &User, event_name: &str) -> Option<Track> { @@ -1623,6 +1640,7 @@ pub struct SimilarAggregator { // Whether a non-default embedder was specified embedder: bool, + retrieve_vectors: bool, // pagination max_limit: usize, @@ -1646,6 +1664,7 @@ impl SimilarAggregator { offset, limit, attributes_to_retrieve: _, + retrieve_vectors, show_ranking_score, show_ranking_score_details, filter, @@ -1690,6 +1709,7 @@ impl SimilarAggregator { ret.ranking_score_threshold = ranking_score_threshold.is_some(); ret.embedder = embedder.is_some(); + ret.retrieve_vectors = *retrieve_vectors; ret } @@ -1722,6 +1742,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = other; if self.timestamp.is_none() { @@ -1751,6 +1772,7 @@ impl SimilarAggregator { } self.embedder |= embedder; + self.retrieve_vectors |= retrieve_vectors; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -1785,6 +1807,7 @@ impl SimilarAggregator { show_ranking_score_details, embedder, ranking_score_threshold, + retrieve_vectors, } = self; if total_received == 0 { @@ -1811,6 +1834,9 @@ impl SimilarAggregator { "avg_criteria_number": format!("{:.2}", filter_sum_of_criteria_terms as f64 / filter_total_number_of_criteria as f64), "most_used_syntax": used_syntax.iter().max_by_key(|(_, v)| *v).map(|(k, _)| json!(k)).unwrap_or_else(|| json!(null)), }, + "vector": { + "retrieve_vectors": retrieve_vectors, + }, "hybrid": { "embedder": embedder, }, diff --git a/meilisearch/src/routes/indexes/documents.rs b/meilisearch/src/routes/indexes/documents.rs index 43fab1dae..1f413ec7d 100644 --- a/meilisearch/src/routes/indexes/documents.rs +++ b/meilisearch/src/routes/indexes/documents.rs @@ -16,6 +16,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::update::IndexDocumentsMethod; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::DocumentId; use meilisearch_types::star_or::OptionStarOrList; use meilisearch_types::tasks::KindWithContent; @@ -39,7 +40,7 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::{ get_task_id, is_dry_run, PaginationView, SummarizedTaskView, PAGINATION_DEFAULT_LIMIT, }; -use crate::search::parse_filter; +use crate::search::{parse_filter, RetrieveVectors}; use crate::Opt; static ACCEPTED_CONTENT_TYPE: Lazy<Vec<String>> = Lazy::new(|| { @@ -94,6 +95,8 @@ pub fn configure(cfg: &mut web::ServiceConfig) { pub struct GetDocument { #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] fields: OptionStarOrList<String>, + #[deserr(default, error =
DeserrQueryParamError<InvalidDocumentRetrieveVectors>)] + retrieve_vectors: Param<bool>, } pub async fn get_document( @@ -107,13 +110,20 @@ debug!(parameters = ?params, "Get document"); let index_uid = IndexUid::try_from(index_uid)?; - analytics.get_fetch_documents(&DocumentFetchKind::PerDocumentId, &req); - - let GetDocument { fields } = params.into_inner(); + let GetDocument { fields, retrieve_vectors: param_retrieve_vectors } = params.into_inner(); let attributes_to_retrieve = fields.merge_star_and_none(); + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(param_retrieve_vectors.0, features)?; + + analytics.get_fetch_documents( + &DocumentFetchKind::PerDocumentId { retrieve_vectors: param_retrieve_vectors.0 }, + &req, + ); + let index = index_scheduler.index(&index_uid)?; - let document = retrieve_document(&index, &document_id, attributes_to_retrieve)?; + let document = + retrieve_document(&index, &document_id, attributes_to_retrieve, retrieve_vectors)?; debug!(returns = ?document, "Get document"); Ok(HttpResponse::Ok().json(document)) } @@ -153,6 +163,8 @@ pub struct BrowseQueryGet { limit: Param<usize>, #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFields>)] fields: OptionStarOrList<String>, + #[deserr(default, error = DeserrQueryParamError<InvalidDocumentRetrieveVectors>)] + retrieve_vectors: Param<bool>, #[deserr(default, error = DeserrQueryParamError<InvalidDocumentFilter>)] filter: Option<String>, } @@ -166,6 +178,8 @@ pub struct BrowseQuery { limit: usize, #[deserr(default, error = DeserrJsonError<InvalidDocumentFields>)] fields: Option<Vec<String>>, + #[deserr(default, error = DeserrJsonError<InvalidDocumentRetrieveVectors>)] + retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError<InvalidDocumentFilter>)] filter: Option<Value>, } @@ -185,6 +199,7 @@ pub async fn documents_by_query_post( with_filter: body.filter.is_some(), limit: body.limit, offset: body.offset, + retrieve_vectors: body.retrieve_vectors, }, &req, ); @@ -201,7 +216,7 @@ pub async fn get_documents( ) -> Result<HttpResponse, ResponseError> { debug!(parameters = ?params, "Get documents GET"); - let BrowseQueryGet { limit, offset, fields, filter } = params.into_inner(); + let BrowseQueryGet { limit, offset, fields, retrieve_vectors, filter } = params.into_inner(); let filter = match filter { Some(f) => match serde_json::from_str(&f) { @@ -215,6 +230,7 @@ pub async fn get_documents( offset: offset.0, limit: limit.0, fields: fields.merge_star_and_none(), + retrieve_vectors: retrieve_vectors.0, filter, }; @@ -223,6 +239,7 @@ pub async fn get_documents( with_filter: query.filter.is_some(), limit: query.limit, offset: query.offset, + retrieve_vectors: query.retrieve_vectors, }, &req, ); @@ -236,10 +253,14 @@ fn documents_by_query( query: BrowseQuery, ) -> Result<HttpResponse, ResponseError> { let index_uid = IndexUid::try_from(index_uid.into_inner())?; - let BrowseQuery { offset, limit, fields, filter } = query; + let BrowseQuery { offset, limit, fields, retrieve_vectors, filter } = query; + + let features = index_scheduler.features(); + let retrieve_vectors = RetrieveVectors::new(retrieve_vectors, features)?; let index = index_scheduler.index(&index_uid)?; - let (total, documents) = retrieve_documents(&index, offset, limit, filter, fields)?; + let (total, documents) = + retrieve_documents(&index, offset, limit, filter, fields, retrieve_vectors)?; let ret = PaginationView::new(offset, limit, total as usize, documents); @@ -579,13 +600,44 @@ fn some_documents<'a, 't: 'a>( index: &'a Index, rtxn: &'t RoTxn, doc_ids: impl IntoIterator<Item = DocumentId> + 'a, + retrieve_vectors: RetrieveVectors, ) -> Result<impl Iterator<Item = Result<Document, ResponseError>> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)|
id).collect(); + let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { - ret.map_err(ResponseError::from).and_then(|(_key, document)| -> Result<_, ResponseError> { - Ok(milli::obkv_to_json(&all_fields, &fields_ids_map, document)?) + ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { + let mut document = milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; + match retrieve_vectors { + RetrieveVectors::Ignore => {} + RetrieveVectors::Hide => { + document.remove("_vectors"); + } + RetrieveVectors::Retrieve => { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, key)? { + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(key)); + let embeddings = ExplicitVectors { + embeddings: Some(vector.into()), + regenerate: !user_provided, + }; + vectors.insert( + name, + serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, + ); + } + document.insert("_vectors".into(), vectors.into()); + } + } + + Ok(document) }) })) } @@ -596,6 +648,7 @@ fn retrieve_documents<S: AsRef<str>>( index: &Index, limit: usize, filter: Option<Value>, attributes_to_retrieve: Option<Vec<S>>, + retrieve_vectors: RetrieveVectors, ) -> Result<(u64, Vec<Document>), ResponseError> { let rtxn = index.read_txn()?; let filter = &filter; @@ -620,53 +673,57 @@ fn retrieve_documents<S: AsRef<str>>( let (it, number_of_documents) = { let number_of_documents = candidates.len(); ( - some_documents(index, &rtxn, candidates.into_iter().skip(offset).take(limit))?, + some_documents( + index, + &rtxn, + candidates.into_iter().skip(offset).take(limit), + retrieve_vectors, + )?, number_of_documents, ) }; - let documents: Result<Vec<Document>, ResponseError> = it + let documents: Vec<_> = it .map(|document| { Ok(match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document?, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve.iter().map(|s| s.as_ref()).chain( + (retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors"), + ), ), None => document?, }) }) - .collect(); + .collect::<Result<_, ResponseError>>()?; - Ok((number_of_documents, documents?)) + Ok((number_of_documents, documents)) } fn retrieve_document<S: AsRef<str>>( index: &Index, doc_id: &str, attributes_to_retrieve: Option<Vec<S>>, + retrieve_vectors: RetrieveVectors, ) -> Result<Document, ResponseError> { let txn = index.read_txn()?; - let fields_ids_map = index.fields_ids_map(&txn)?; - let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let internal_id = index .external_documents_ids() .get(&txn, doc_id)? .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; - let document = index - .documents(&txn, std::iter::once(internal_id))? - .into_iter() + let document = some_documents(index, &txn, Some(internal_id), retrieve_vectors)?
.next() - .map(|(_, d)| d) - .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))?; + .ok_or_else(|| MeilisearchHttpError::DocumentNotFound(doc_id.to_string()))??; - let document = meilisearch_types::milli::obkv_to_json(&all_fields, &fields_ids_map, document)?; let document = match &attributes_to_retrieve { Some(attributes_to_retrieve) => permissive_json_pointer::select_values( &document, - attributes_to_retrieve.iter().map(|s| s.as_ref()), + attributes_to_retrieve + .iter() + .map(|s| s.as_ref()) + .chain((retrieve_vectors == RetrieveVectors::Retrieve).then_some("_vectors")), ), None => document, }; diff --git a/meilisearch/src/routes/indexes/facet_search.rs b/meilisearch/src/routes/indexes/facet_search.rs index 10b371f2d..2e9cf6e1b 100644 --- a/meilisearch/src/routes/indexes/facet_search.rs +++ b/meilisearch/src/routes/indexes/facet_search.rs @@ -115,6 +115,7 @@ impl From for SearchQuery { page: None, hits_per_page: None, attributes_to_retrieve: None, + retrieve_vectors: false, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), attributes_to_highlight: None, diff --git a/meilisearch/src/routes/indexes/search.rs b/meilisearch/src/routes/indexes/search.rs index 348d8295c..421cf2940 100644 --- a/meilisearch/src/routes/indexes/search.rs +++ b/meilisearch/src/routes/indexes/search.rs @@ -20,9 +20,9 @@ use crate::extractors::sequential_extractor::SeqHandler; use crate::metrics::MEILISEARCH_DEGRADED_SEARCH_REQUESTS; use crate::search::{ add_search_rules, perform_search, HybridQuery, MatchingStrategy, RankingScoreThreshold, - SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, DEFAULT_CROP_MARKER, - DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, DEFAULT_SEARCH_LIMIT, - DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, + RetrieveVectors, SearchKind, SearchQuery, SemanticRatio, DEFAULT_CROP_LENGTH, + DEFAULT_CROP_MARKER, DEFAULT_HIGHLIGHT_POST_TAG, DEFAULT_HIGHLIGHT_PRE_TAG, + DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, DEFAULT_SEMANTIC_RATIO, }; use crate::search_queue::SearchQueue; @@ -51,6 +51,8 @@ pub struct SearchQueryGet { hits_per_page: Option>, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_crop: Option>, #[deserr(default = Param(DEFAULT_CROP_LENGTH()), error = DeserrQueryParamError)] @@ -153,6 +155,7 @@ impl From for SearchQuery { page: other.page.as_deref().copied(), hits_per_page: other.hits_per_page.as_deref().copied(), attributes_to_retrieve: other.attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: other.retrieve_vectors.0, attributes_to_crop: other.attributes_to_crop.map(|o| o.into_iter().collect()), crop_length: other.crop_length.0, attributes_to_highlight: other.attributes_to_highlight.map(|o| o.into_iter().collect()), @@ -222,10 +225,12 @@ pub async fn search_with_url_query( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; - + let retrieve_vector = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await?; if let Ok(ref search_result) = search_result { 
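
With the plumbing above in place, the flag is accepted by both the search and documents routes. A sketch of how it is exercised, written against the integration-test helpers that appear later in this diff, and assuming the `vectorStore` experimental feature is already enabled:

// Search: hits gain `_vectors: { <embedder>: { embeddings, regenerate } }`.
let (response, code) = index
    .search_post(json!({ "q": "kefir", "retrieveVectors": true }))
    .await;
assert_eq!(code, 200);

// Documents: the same flag is a plain query parameter.
let (_response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await;
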
aggregate.succeed(search_result); } @@ -262,10 +267,13 @@ pub async fn search_with_post( let features = index_scheduler.features(); let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features)?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; let _permit = search_queue.try_get_search_permit().await?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)).await?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vectors) + }) + .await?; if let Ok(ref search_result) = search_result { aggregate.succeed(search_result); if search_result.degraded { @@ -287,11 +295,10 @@ pub fn search_kind( features: RoFeatures, ) -> Result { if query.vector.is_some() { - features.check_vector("Passing `vector` as a query parameter")?; + features.check_vector("Passing `vector` as a parameter")?; } - if query.hybrid.is_some() { - features.check_vector("Passing `hybrid` as a query parameter")?; + features.check_vector("Passing `hybrid` as a parameter")?; } // regardless of anything, always do a keyword search when we don't have a vector and the query is whitespace or missing diff --git a/meilisearch/src/routes/indexes/similar.rs b/meilisearch/src/routes/indexes/similar.rs index 518fedab7..1dd83b09b 100644 --- a/meilisearch/src/routes/indexes/similar.rs +++ b/meilisearch/src/routes/indexes/similar.rs @@ -4,11 +4,7 @@ use deserr::actix_web::{AwebJson, AwebQueryParameter}; use index_scheduler::IndexScheduler; use meilisearch_types::deserr::query_params::Param; use meilisearch_types::deserr::{DeserrJsonError, DeserrQueryParamError}; -use meilisearch_types::error::deserr_codes::{ - InvalidEmbedder, InvalidSimilarAttributesToRetrieve, InvalidSimilarFilter, InvalidSimilarId, - InvalidSimilarLimit, InvalidSimilarOffset, InvalidSimilarRankingScoreThreshold, - InvalidSimilarShowRankingScore, InvalidSimilarShowRankingScoreDetails, -}; +use meilisearch_types::error::deserr_codes::*; use meilisearch_types::error::{ErrorCode as _, ResponseError}; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::keys::actions; @@ -21,8 +17,8 @@ use crate::analytics::{Analytics, SimilarAggregator}; use crate::extractors::authentication::GuardedData; use crate::extractors::sequential_extractor::SeqHandler; use crate::search::{ - add_search_rules, perform_similar, RankingScoreThresholdSimilar, SearchKind, SimilarQuery, - SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, + add_search_rules, perform_similar, RankingScoreThresholdSimilar, RetrieveVectors, SearchKind, + SimilarQuery, SimilarResult, DEFAULT_SEARCH_LIMIT, DEFAULT_SEARCH_OFFSET, }; pub fn configure(cfg: &mut web::ServiceConfig) { @@ -97,6 +93,8 @@ async fn similar( features.check_vector("Using the similar API")?; + let retrieve_vectors = RetrieveVectors::new(query.retrieve_vectors, features)?; + // Tenant token search_rules. if let Some(search_rules) = index_scheduler.filters().get_index_search_rules(&index_uid) { add_search_rules(&mut query.filter, search_rules); @@ -107,8 +105,10 @@ async fn similar( let (embedder_name, embedder) = SearchKind::embedder(&index_scheduler, &index, query.embedder.as_deref(), None)?; - tokio::task::spawn_blocking(move || perform_similar(&index, query, embedder_name, embedder)) - .await? + tokio::task::spawn_blocking(move || { + perform_similar(&index, query, embedder_name, embedder, retrieve_vectors) + }) + .await? 
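
The `/similar` route follows the same pattern: the flag is resolved against the feature set before the blocking task is spawned, so a disabled feature fails fast instead of inside the worker. A hypothetical call (the `similar_post` helper is assumed for illustration, it is not part of this diff):

let (response, code) = index
    .similar_post(json!({ "id": 0, "embedder": "manual", "retrieveVectors": true }))
    .await;
assert_eq!(code, 200);
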
} #[derive(Debug, deserr::Deserr)] @@ -122,6 +122,8 @@ pub struct SimilarQueryGet { limit: Param, #[deserr(default, error = DeserrQueryParamError)] attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrQueryParamError)] + retrieve_vectors: Param, #[deserr(default, error = DeserrQueryParamError)] filter: Option, #[deserr(default, error = DeserrQueryParamError)] @@ -156,6 +158,7 @@ impl TryFrom for SimilarQuery { offset, limit, attributes_to_retrieve, + retrieve_vectors, filter, show_ranking_score, show_ranking_score_details, @@ -180,6 +183,7 @@ impl TryFrom for SimilarQuery { filter, embedder, attributes_to_retrieve: attributes_to_retrieve.map(|o| o.into_iter().collect()), + retrieve_vectors: retrieve_vectors.0, show_ranking_score: show_ranking_score.0, show_ranking_score_details: show_ranking_score_details.0, ranking_score_threshold: ranking_score_threshold.map(|x| x.0), diff --git a/meilisearch/src/routes/multi_search.rs b/meilisearch/src/routes/multi_search.rs index a83dc4bc0..1d697dac6 100644 --- a/meilisearch/src/routes/multi_search.rs +++ b/meilisearch/src/routes/multi_search.rs @@ -15,7 +15,7 @@ use crate::extractors::authentication::{AuthenticationError, GuardedData}; use crate::extractors::sequential_extractor::SeqHandler; use crate::routes::indexes::search::search_kind; use crate::search::{ - add_search_rules, perform_search, SearchQueryWithIndex, SearchResultWithIndex, + add_search_rules, perform_search, RetrieveVectors, SearchQueryWithIndex, SearchResultWithIndex, }; use crate::search_queue::SearchQueue; @@ -83,11 +83,14 @@ pub async fn multi_search_with_post( let search_kind = search_kind(&query, index_scheduler.get_ref(), &index, features) .with_index(query_index)?; + let retrieve_vector = + RetrieveVectors::new(query.retrieve_vectors, features).with_index(query_index)?; - let search_result = - tokio::task::spawn_blocking(move || perform_search(&index, query, search_kind)) - .await - .with_index(query_index)?; + let search_result = tokio::task::spawn_blocking(move || { + perform_search(&index, query, search_kind, retrieve_vector) + }) + .await + .with_index(query_index)?; search_results.push(SearchResultWithIndex { index_uid: index_uid.into_inner(), diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 05b3c1aff..9632e3f5d 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -15,6 +15,7 @@ use meilisearch_types::error::{Code, ResponseError}; use meilisearch_types::heed::RoTxn; use meilisearch_types::index_uid::IndexUid; use meilisearch_types::milli::score_details::{ScoreDetails, ScoringStrategy}; +use meilisearch_types::milli::vector::parsed_vectors::ExplicitVectors; use meilisearch_types::milli::vector::Embedder; use meilisearch_types::milli::{FacetValueHit, OrderBy, SearchForFacetValues, TimeBudget}; use meilisearch_types::settings::DEFAULT_PAGINATION_MAX_TOTAL_HITS; @@ -59,6 +60,8 @@ pub struct SearchQuery { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] @@ -141,6 +144,7 @@ impl fmt::Debug for SearchQuery { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -173,6 +177,9 @@ impl fmt::Debug for SearchQuery { if let Some(q) = q { debug.field("q", &q); } + if 
*retrieve_vectors { + debug.field("retrieve_vectors", &retrieve_vectors); + } if let Some(v) = vector { if v.len() < 10 { debug.field("vector", &v); @@ -370,6 +377,8 @@ pub struct SearchQueryWithIndex { pub hits_per_page: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError)] pub attributes_to_crop: Option>, #[deserr(default, error = DeserrJsonError, default = DEFAULT_CROP_LENGTH())] @@ -413,6 +422,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -440,6 +450,7 @@ impl SearchQueryWithIndex { page, hits_per_page, attributes_to_retrieve, + retrieve_vectors, attributes_to_crop, crop_length, attributes_to_highlight, @@ -478,6 +489,8 @@ pub struct SimilarQuery { pub embedder: Option, #[deserr(default, error = DeserrJsonError)] pub attributes_to_retrieve: Option>, + #[deserr(default, error = DeserrJsonError)] + pub retrieve_vectors: bool, #[deserr(default, error = DeserrJsonError, default)] pub show_ranking_score: bool, #[deserr(default, error = DeserrJsonError, default)] @@ -810,6 +823,7 @@ pub fn perform_search( index: &Index, query: SearchQuery, search_kind: SearchKind, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -847,6 +861,8 @@ pub fn perform_search( page, hits_per_page, attributes_to_retrieve, + // use the enum passed as parameter + retrieve_vectors: _, attributes_to_crop, crop_length, attributes_to_highlight, @@ -870,6 +886,7 @@ pub fn perform_search( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight, attributes_to_crop, crop_length, @@ -953,6 +970,7 @@ pub fn perform_search( struct AttributesFormat { attributes_to_retrieve: Option>, + retrieve_vectors: RetrieveVectors, attributes_to_highlight: Option>, attributes_to_crop: Option>, crop_length: usize, @@ -965,6 +983,36 @@ struct AttributesFormat { show_ranking_score_details: bool, } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RetrieveVectors { + /// Do not touch the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is disabled + Ignore, + /// Remove the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `false` + Hide, + /// Retrieve vectors from the DB and merge them into the `_vectors` field + /// + /// this is the behavior when the vectorStore feature is enabled, and `retrieveVectors` is `true` + Retrieve, +} + +impl RetrieveVectors { + pub fn new( + retrieve_vector: bool, + features: index_scheduler::RoFeatures, + ) -> Result { + match (retrieve_vector, features.check_vector("Passing `retrieveVectors` as a parameter")) { + (true, Ok(())) => Ok(Self::Retrieve), + (true, Err(error)) => Err(error), + (false, Ok(())) => Ok(Self::Hide), + (false, Err(_)) => Ok(Self::Ignore), + } + } +} + fn make_hits( index: &Index, rtxn: &RoTxn<'_>, @@ -974,10 +1022,32 @@ fn make_hits( document_scores: Vec>, ) -> Result, MeilisearchHttpError> { let fields_ids_map = index.fields_ids_map(rtxn).unwrap(); - let displayed_ids = index - .displayed_fields_ids(rtxn)? 
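
`RetrieveVectors::new` above encodes a four-case truth table over the request flag and the `vectorStore` experimental feature; restated compactly (a sketch, not the real signature):

// request flag x feature flag -> behavior
fn resolve(retrieve_vectors: bool, feature_enabled: bool) -> Result<RetrieveVectors, &'static str> {
    match (retrieve_vectors, feature_enabled) {
        (true, true) => Ok(RetrieveVectors::Retrieve), // merge vectors into `_vectors`
        (true, false) => Err("feature_not_enabled"),   // asking for vectors requires the feature
        (false, true) => Ok(RetrieveVectors::Hide),    // strip `_vectors` from responses
        (false, false) => Ok(RetrieveVectors::Ignore), // pre-feature behavior: leave documents as-is
    }
}
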
- .map(|fields| fields.into_iter().collect::<BTreeSet<_>>()) - .unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); + let displayed_ids = + index.displayed_fields_ids(rtxn)?.map(|fields| fields.into_iter().collect::<BTreeSet<_>>()); + + let vectors_fid = fields_ids_map.id(milli::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + + let vectors_is_hidden = match (&displayed_ids, vectors_fid) { + // displayed_ids is a wildcard, so `_vectors` can be displayed regardless of its fid + (None, _) => false, + // displayed_ids is a finite list, and `_vectors` cannot be part of it because it is not an existing field + (Some(_), None) => true, + // displayed_ids is a finite list, so hide if `_vectors` is not part of it + (Some(map), Some(vectors_fid)) => !map.contains(&vectors_fid), + }; + + let retrieve_vectors = if let RetrieveVectors::Retrieve = format.retrieve_vectors { + if vectors_is_hidden { + RetrieveVectors::Hide + } else { + RetrieveVectors::Retrieve + } + } else { + format.retrieve_vectors + }; + + let displayed_ids = + displayed_ids.unwrap_or_else(|| fields_ids_map.iter().map(|(id, _)| id).collect()); let fids = |attrs: &BTreeSet<String>| { let mut ids = BTreeSet::new(); for attr in attrs { @@ -1000,6 +1070,7 @@ fn make_hits( .intersection(&displayed_ids) .cloned() .collect(); + let attr_to_highlight = format.attributes_to_highlight.unwrap_or_default(); let attr_to_crop = format.attributes_to_crop.unwrap_or_default(); let formatted_options = compute_formatted_options( @@ -1033,18 +1104,48 @@ fn make_hits( formatter_builder.highlight_prefix(format.highlight_pre_tag); formatter_builder.highlight_suffix(format.highlight_post_tag); let mut documents = Vec::new(); + let embedding_configs = index.embedding_configs(rtxn)?; let documents_iter = index.documents(rtxn, documents_ids)?; - for ((_id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { + for ((id, obkv), score) in documents_iter.into_iter().zip(document_scores.into_iter()) { // First generate a document with all the displayed fields let displayed_document = make_document(&displayed_ids, &fields_ids_map, obkv)?; + let add_vectors_fid = + vectors_fid.filter(|_fid| retrieve_vectors == RetrieveVectors::Retrieve); + // select the attributes to retrieve let attributes_to_retrieve = to_retrieve_ids .iter() + // skip the vectors_fid if RetrieveVectors::Hide + .filter(|fid| match vectors_fid { + Some(vectors_fid) => { + !(retrieve_vectors == RetrieveVectors::Hide && **fid == vectors_fid) + } + None => true, + }) + // need to retrieve the existing `_vectors` field if `RetrieveVectors::Retrieve` is set + .chain(add_vectors_fid.iter()) .map(|&fid| fields_ids_map.name(fid).expect("Missing field name")); let mut document = permissive_json_pointer::select_values(&displayed_document, attributes_to_retrieve); + if retrieve_vectors == RetrieveVectors::Retrieve { + let mut vectors = match document.remove("_vectors") { + Some(Value::Object(map)) => map, + _ => Default::default(), + }; + for (name, vector) in index.embeddings(rtxn, id)?
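
One subtlety in `make_hits` above: an explicit `displayedAttributes` list that omits `_vectors` overrides a `retrieveVectors=true` request. The downgrade in isolation (sketch):

fn effective(requested: RetrieveVectors, vectors_is_hidden: bool) -> RetrieveVectors {
    match requested {
        // `displayedAttributes: ["title"]` hides `_vectors`, so Retrieve becomes Hide;
        // a wildcard, or a list containing `_vectors`, leaves the request untouched.
        RetrieveVectors::Retrieve if vectors_is_hidden => RetrieveVectors::Hide,
        other => other,
    }
}
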
{ + let user_provided = embedding_configs + .iter() + .find(|conf| conf.name == name) + .is_some_and(|conf| conf.user_provided.contains(id)); + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + vectors.insert(name, serde_json::to_value(embeddings)?); + } + document.insert("_vectors".into(), vectors.into()); + } + let (matches_position, formatted) = format_fields( &displayed_document, &fields_ids_map, @@ -1114,6 +1215,7 @@ pub fn perform_similar( query: SimilarQuery, embedder_name: String, embedder: Arc, + retrieve_vectors: RetrieveVectors, ) -> Result { let before_search = Instant::now(); let rtxn = index.read_txn()?; @@ -1125,6 +1227,7 @@ pub fn perform_similar( filter: _, embedder: _, attributes_to_retrieve, + retrieve_vectors: _, show_ranking_score, show_ranking_score_details, ranking_score_threshold, @@ -1171,6 +1274,7 @@ pub fn perform_similar( let format = AttributesFormat { attributes_to_retrieve, + retrieve_vectors, attributes_to_highlight: None, attributes_to_crop: None, crop_length: DEFAULT_CROP_LENGTH(), diff --git a/meilisearch/tests/common/index.rs b/meilisearch/tests/common/index.rs index 3ac33b4e9..114ede9b8 100644 --- a/meilisearch/tests/common/index.rs +++ b/meilisearch/tests/common/index.rs @@ -182,14 +182,10 @@ impl Index<'_> { self.service.get(url).await } - pub async fn get_document( - &self, - id: u64, - options: Option, - ) -> (Value, StatusCode) { + pub async fn get_document(&self, id: u64, options: Option) -> (Value, StatusCode) { let mut url = format!("/indexes/{}/documents/{}", urlencode(self.uid.as_ref()), id); - if let Some(fields) = options.and_then(|o| o.fields) { - let _ = write!(url, "?fields={}", fields.join(",")); + if let Some(options) = options { + write!(url, "?{}", yaup::to_string(&options).unwrap()).unwrap(); } self.service.get(url).await } @@ -205,18 +201,11 @@ impl Index<'_> { } pub async fn get_all_documents(&self, options: GetAllDocumentsOptions) -> (Value, StatusCode) { - let mut url = format!("/indexes/{}/documents?", urlencode(self.uid.as_ref())); - if let Some(limit) = options.limit { - let _ = write!(url, "limit={}&", limit); - } - - if let Some(offset) = options.offset { - let _ = write!(url, "offset={}&", offset); - } - - if let Some(attributes_to_retrieve) = options.attributes_to_retrieve { - let _ = write!(url, "fields={}&", attributes_to_retrieve.join(",")); - } + let url = format!( + "/indexes/{}/documents?{}", + urlencode(self.uid.as_ref()), + yaup::to_string(&options).unwrap() + ); self.service.get(url).await } @@ -435,13 +424,11 @@ impl Index<'_> { } } -pub struct GetDocumentOptions { - pub fields: Option>, -} - -#[derive(Debug, Default)] +#[derive(Debug, Default, serde::Serialize)] +#[serde(rename_all = "camelCase")] pub struct GetAllDocumentsOptions { pub limit: Option, pub offset: Option, - pub attributes_to_retrieve: Option>, + pub retrieve_vectors: bool, + pub fields: Option>, } diff --git a/meilisearch/tests/common/mod.rs b/meilisearch/tests/common/mod.rs index 3117dd185..317e5e171 100644 --- a/meilisearch/tests/common/mod.rs +++ b/meilisearch/tests/common/mod.rs @@ -6,7 +6,7 @@ pub mod service; use std::fmt::{self, Display}; #[allow(unused)] -pub use index::{GetAllDocumentsOptions, GetDocumentOptions}; +pub use index::GetAllDocumentsOptions; use meili_snap::json_string; use serde::{Deserialize, Serialize}; #[allow(unused)] diff --git a/meilisearch/tests/documents/errors.rs b/meilisearch/tests/documents/errors.rs index cd2d89813..8e9a3a696 100644 --- 
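
The test-helper change above replaces hand-written `write!` query-string building with serde: `GetAllDocumentsOptions` derives `Serialize` with `rename_all = "camelCase"`, and `yaup` turns it into a query string. Roughly (the exact encoding, e.g. of arrays, is yaup's choice):

let options = GetAllDocumentsOptions {
    limit: Some(2),
    retrieve_vectors: true,
    fields: Some(vec!["name"]),
    ..Default::default()
};
// Expected to yield something like "limit=2&retrieveVectors=true&fields=name",
// which the helper then prefixes with `?` when building the URL.
let query_string = yaup::to_string(&options).unwrap();
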
a/meilisearch/tests/documents/errors.rs +++ b/meilisearch/tests/documents/errors.rs @@ -795,3 +795,70 @@ async fn fetch_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + + // GET ALL DOCUMENTS BY QUERY + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=tamo").await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_all_documents_raw("?retrieveVectors=true").await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // FETCH ALL DOCUMENTS BY POST + let (response, _code) = + index.get_document_by_filter(json!({ "retrieveVectors": "tamo" })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"tamo\"`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document_by_filter(json!({ "retrieveVectors": true })).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + + // GET A SINGLE DOCUMENT + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": "tamo"}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `tamo` as a boolean, expected either `true` or `false`", + "code": "invalid_document_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_document_retrieve_vectors" + } + "###); + let (response, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); +} diff --git a/meilisearch/tests/documents/get_documents.rs b/meilisearch/tests/documents/get_documents.rs index 3b0629fcb..efe4cf8e9 100644 --- a/meilisearch/tests/documents/get_documents.rs +++ b/meilisearch/tests/documents/get_documents.rs @@ -4,7 +4,7 @@ use meili_snap::*; use urlencoding::encode as urlencode; use crate::common::encoder::Encoder; -use crate::common::{GetAllDocumentsOptions, GetDocumentOptions, Server, Value}; +use crate::common::{GetAllDocumentsOptions, Server, Value}; use crate::json; // TODO: partial test since we are testing error, amd error is not yet fully implemented in @@ -59,8 +59,7 @@ async fn get_document() { }) ); - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["id"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["id"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -69,9 +68,8 @@ async fn get_document() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["nested.content"]) })) - .await; + let (response, code) = + index.get_document(0, Some(json!({ "fields": ["nested.content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -211,7 +209,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name"]), + fields: Some(vec!["name"]), ..Default::default() }) .await; @@ -225,9 +223,19 @@ async fn test_get_all_documents_attributes_to_retrieve() { assert_eq!(response["limit"], json!(20)); assert_eq!(response["total"], json!(77)); + let (response, code) = index.get_all_documents_raw("?fields=").await; + assert_eq!(code, 200); + assert_eq!(response["results"].as_array().unwrap().len(), 20); + for results in response["results"].as_array().unwrap() { + assert_eq!(results.as_object().unwrap().keys().count(), 0); + } + assert_eq!(response["offset"], json!(0)); + assert_eq!(response["limit"], json!(20)); + assert_eq!(response["total"], json!(77)); + let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec![]), + fields: Some(vec!["wrong"]), ..Default::default() }) .await; @@ -242,22 +250,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["wrong"]), - ..Default::default() - }) - .await; - assert_eq!(code, 200); - assert_eq!(response["results"].as_array().unwrap().len(), 20); - for results in response["results"].as_array().unwrap() { - assert_eq!(results.as_object().unwrap().keys().count(), 0); - } - assert_eq!(response["offset"], json!(0)); - assert_eq!(response["limit"], json!(20)); - assert_eq!(response["total"], json!(77)); - - let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["name", "tags"]), + fields: Some(vec!["name", "tags"]), ..Default::default() }) .await; @@ -270,10 +263,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { } let (response, code) = index - .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*"]), - ..Default::default() - }) + .get_all_documents(GetAllDocumentsOptions { fields: Some(vec!["*"]), 
..Default::default() }) .await; assert_eq!(code, 200); assert_eq!(response["results"].as_array().unwrap().len(), 20); @@ -283,7 +273,7 @@ async fn test_get_all_documents_attributes_to_retrieve() { let (response, code) = index .get_all_documents(GetAllDocumentsOptions { - attributes_to_retrieve: Some(vec!["*", "wrong"]), + fields: Some(vec!["*", "wrong"]), ..Default::default() }) .await; @@ -316,12 +306,10 @@ async fn get_document_s_nested_attributes_to_retrieve() { assert_eq!(code, 202); index.wait_task(1).await; - let (response, code) = - index.get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!(response, json!({})); - let (response, code) = - index.get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content"]) })).await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -333,9 +321,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { }) ); - let (response, code) = index - .get_document(0, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(0, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -343,9 +329,7 @@ async fn get_document_s_nested_attributes_to_retrieve() { "content.truc": "foobar", }) ); - let (response, code) = index - .get_document(1, Some(GetDocumentOptions { fields: Some(vec!["content.truc"]) })) - .await; + let (response, code) = index.get_document(1, Some(json!({ "fields": ["content.truc"] }))).await; assert_eq!(code, 200); assert_eq!( response, @@ -540,3 +524,207 @@ async fn get_document_by_filter() { } "###); } + +#[actix_rt::test] +async fn get_document_with_vectors() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + // by default you shouldn't see the `_vectors` object + let (documents, _code) = index.get_all_documents(Default::default()).await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, None).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir" + } + "###); + + // if we try to retrieve the vectors with the `fields` parameter they + // still shouldn't be displayed + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + fields: Some(vec!["name", "_vectors"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { 
+ "name": "kefir" + }, + { + "name": "echo" + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"fields": ["name", "_vectors"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir" + } + "###); + + // If we specify the retrieve vectors boolean and nothing else we should get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = index.get_document(0, Some(json!({"retrieveVectors": true}))).await; + snapshot!(json_string!(documents), @r###" + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + } + "###); + + // If we specify the retrieve vectors boolean and exclude vectors form the `fields` we should still get the vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + retrieve_vectors: true, + fields: Some(vec!["name"]), + ..Default::default() + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + let (documents, _code) = + index.get_document(0, Some(json!({"retrieveVectors": true, "fields": ["name"]}))).await; + snapshot!(json_string!(documents), @r###" + { + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + } + "###); +} diff --git a/meilisearch/tests/dumps/mod.rs b/meilisearch/tests/dumps/mod.rs index c8f8ca105..fa402cb41 100644 --- a/meilisearch/tests/dumps/mod.rs +++ b/meilisearch/tests/dumps/mod.rs @@ -1938,3 +1938,210 @@ async fn import_dump_v6_containing_experimental_features() { }) .await; } + +// In this test we must generate the dump ourselves to ensure the +// `user provided` vectors are well set +#[actix_rt::test] +#[cfg_attr(target_os = "windows", ignore)] +async fn generate_and_import_dump_containing_vectors() { + let temp = tempfile::tempdir().unwrap(); + let mut opt = default_settings(temp.path()); + let server = Server::new_with_options(opt.clone()).await.unwrap(); + let (code, _) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + let index = server.index("pets"); + let (response, code) = index + .update_settings(json!( + { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}", + } + } + } + )) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + let (response, code) = index + .add_documents( + json!([ + {"id": 0, "doggo": "kefir", "_vectors": { "doggo_embedder": vec![0; 384] }}, + {"id": 1, "doggo": "echo", "_vectors": { 
"doggo_embedder": { "regenerate": false, "embeddings": vec![1; 384] }}}, + {"id": 2, "doggo": "intel", "_vectors": { "doggo_embedder": { "regenerate": true, "embeddings": vec![2; 384] }}}, + {"id": 3, "doggo": "bill", "_vectors": { "doggo_embedder": { "regenerate": true }}}, + {"id": 4, "doggo": "max" }, + ]), + None, + ) + .await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response); + + let (response, code) = server.create_dump().await; + snapshot!(code, @"202 Accepted"); + let response = index.wait_task(response.uid()).await; + snapshot!(response["status"], @r###""succeeded""###); + + // ========= We made a dump, now we should clear the DB and try to import our dump + drop(server); + tokio::fs::remove_dir_all(&opt.db_path).await.unwrap(); + let dump_name = format!("{}.dump", response["details"]["dumpUid"].as_str().unwrap()); + let dump_path = opt.dump_dir.join(dump_name); + assert!(dump_path.exists(), "path: `{}`", dump_path.display()); + + opt.import_dump = Some(dump_path); + // NOTE: We shouldn't have to change the database path but I lost one hour + // because of a « bad path » error and that fixed it. + opt.db_path = temp.path().join("data.ms"); + + let mut server = Server::new_auth_with_options(opt, temp).await; + server.use_api_key("MASTER_KEY"); + + let (indexes, code) = server.list_indexes(None, None).await; + assert_eq!(code, 200, "{indexes}"); + + snapshot!(indexes["results"].as_array().unwrap().len(), @"1"); + snapshot!(indexes["results"][0]["uid"], @r###""pets""###); + snapshot!(indexes["results"][0]["primaryKey"], @r###""id""###); + + let (response, code) = server.get_features().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let index = server.index("pets"); + + let (response, code) = index.settings().await; + meili_snap::snapshot!(code, @"200 OK"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "displayedAttributes": [ + "*" + ], + "searchableAttributes": [ + "*" + ], + "filterableAttributes": [], + "sortableAttributes": [], + "rankingRules": [ + "words", + "typo", + "proximity", + "attribute", + "sort", + "exactness" + ], + "stopWords": [], + "nonSeparatorTokens": [], + "separatorTokens": [], + "dictionary": [], + "synonyms": {}, + "distinctAttribute": null, + "proximityPrecision": "byWord", + "typoTolerance": { + "enabled": true, + "minWordSizeForTypos": { + "oneTypo": 5, + "twoTypos": 9 + }, + "disableOnWords": [], + "disableOnAttributes": [] + }, + "faceting": { + "maxValuesPerFacet": 100, + "sortFacetValuesBy": { + "*": "alpha" + } + }, + "pagination": { + "maxTotalHits": 1000 + }, + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + }, + "searchCutoffMs": null + } + "###); + + index + .search(json!({"retrieveVectors": true}), |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"], { "[]._vectors.doggo_embedder.embeddings" => "[vector]" }), @r###" + [ + { + "id": 0, + "doggo": "kefir", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": false + } + } + }, + { + "id": 1, + "doggo": "echo", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": false + } + } + }, + { + "id": 2, 
+ "doggo": "intel", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 3, + "doggo": "bill", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + }, + { + "id": 4, + "doggo": "max", + "_vectors": { + "doggo_embedder": { + "embeddings": "[vector]", + "regenerate": true + } + } + } + ] + "###); + }) + .await; +} diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap new file mode 100644 index 000000000..4b05d417a --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/1.snap @@ -0,0 +1,25 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 0, + "indexUid": "pets", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "doggo_embedder": { + "source": "huggingFace", + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "documentTemplate": "{{doc.doggo}}" + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap new file mode 100644 index 000000000..43971924b --- /dev/null +++ b/meilisearch/tests/dumps/snapshots/mod.rs/generate_and_import_dump_containing_vectors/2.snap @@ -0,0 +1,19 @@ +--- +source: meilisearch/tests/dumps/mod.rs +--- +{ + "uid": 1, + "indexUid": "pets", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 5, + "indexedDocuments": 5 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" +} diff --git a/meilisearch/tests/integration.rs b/meilisearch/tests/integration.rs index bb77ecc63..78da9825a 100644 --- a/meilisearch/tests/integration.rs +++ b/meilisearch/tests/integration.rs @@ -13,6 +13,7 @@ mod snapshot; mod stats; mod swap_indexes; mod tasks; +mod vector; // Tests are isolated by features in different modules to allow better readability, test // targetability, and improved incremental compilation times. diff --git a/meilisearch/tests/search/errors.rs b/meilisearch/tests/search/errors.rs index 53d516c44..75977b190 100644 --- a/meilisearch/tests/search/errors.rs +++ b/meilisearch/tests/search/errors.rs @@ -167,6 +167,74 @@ async fn search_bad_hits_per_page() { "###); } +#[actix_rt::test] +async fn search_bad_attributes_to_retrieve() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"attributesToRetrieve": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.attributesToRetrieve`: expected an array, but found a string: `\"doggo\"`", + "code": "invalid_search_attributes_to_retrieve", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_attributes_to_retrieve" + } + "###); + // Can't make the `attributes_to_retrieve` fail with a get search since it'll accept anything as an array of strings. 
+} + +#[actix_rt::test] +async fn search_bad_retrieve_vectors() { + let server = Server::new().await; + let index = server.index("test"); + + let (response, code) = index.search_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); + + let (response, code) = index.search_get("retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_search_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_search_retrieve_vectors" + } + "###); +} + #[actix_rt::test] async fn search_bad_attributes_to_crop() { let server = Server::new().await; diff --git a/meilisearch/tests/search/hybrid.rs b/meilisearch/tests/search/hybrid.rs index 9c50df6e1..31b2940d8 100644 --- a/meilisearch/tests/search/hybrid.rs +++ b/meilisearch/tests/search/hybrid.rs @@ -124,29 +124,29 @@ async fn simple_search() { let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}}},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": 
{"semanticRatio": 0.5}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.5}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"2"); let (response, code) = index .search_post( - json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true}), + json!({"q": "Captain", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -204,10 +204,10 @@ async fn distribution_shift() { let server = Server::new().await; let index = index_with_documents_user_provided(&server, &SIMPLE_SEARCH_DOCUMENTS_VEC).await; - let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}}); + let search = json!({"q": "Captain", "vector": [1.0, 1.0], "showRankingScore": true, "hybrid": {"semanticRatio": 1.0}, "retrieveVectors": true}); let (response, code) = index.search_post(search.clone()).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9472135901451112}]"###); let (response, code) = index .update_settings(json!({ @@ -228,7 +228,7 @@ async fn distribution_shift() { let (response, code) = index.search_post(search).await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.1920928955078125e-7}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.19161224365234375},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.1920928955078125e-7}]"###); } #[actix_rt::test] @@ -239,20 +239,23 @@ async fn highlighter() { let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.2}, - "attributesToHighlight": [ - "desc" + "retrieveVectors": true, + "attributesToHighlight": [ + "desc", + "_vectors", ], - "highlightPreTag": "**BEGIN**", - "highlightPostTag": "**END**" + "highlightPreTag": "**BEGIN**", + "highlightPostTag": "**END**", })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}}}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam 
ersatz","id":"3"}},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"}},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"}}]"###); snapshot!(response["semanticHitCount"], @"0"); let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 0.8}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -262,13 +265,14 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the **BEGIN**Marvel**END** Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a **BEGIN**Captain**END** **BEGIN**Marvel**END** ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); // no highlighting on full semantic let (response, code) = index .search_post(json!({"q": "Captain Marvel", "vector": [1.0, 1.0], "hybrid": {"semanticRatio": 1.0}, + "retrieveVectors": true, "showRankingScore": true, "attributesToHighlight": [ "desc" @@ -278,7 +282,7 @@ async fn highlighter() { })) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":["2.0","3.0"]}},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":["1.0","2.0"]}},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":["1.0","3.0"]}},"_rankingScore":0.9472135901451112}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_formatted":{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3"},"_rankingScore":0.990290343761444},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_formatted":{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2"},"_rankingScore":0.974341630935669},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_formatted":{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1"},"_rankingScore":0.9472135901451112}]"###); snapshot!(response["semanticHitCount"], @"3"); } @@ -361,12 +365,12 @@ async fn single_document() { let (response, code) = index .search_post( - json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"vector": [1.0, 3.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0}"###); + snapshot!(response["hits"][0], @r###"{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0}"###); snapshot!(response["semanticHitCount"], @"1"); } @@ -377,25 +381,25 @@ async fn query_combination() { // search without query and vector, but with hybrid => still placeholder let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // same with a different semantic ratio let (response, code) = index - .search_post(json!({"hybrid": {"semanticRatio": 0.76}, "showRankingScore": true})) + .search_post(json!({"hybrid": {"semanticRatio": 0.76}, 
"showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // wrong vector dimensions let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0, 1.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -410,34 +414,34 @@ async fn query_combination() { // full vector let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.6581138968467712}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.7773500680923462},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.7236068248748779},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.6581138968467712}]"###); snapshot!(response["semanticHitCount"], @"3"); // full keyword, without a query let (response, code) = index - .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a 
Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":1.0}]"###); + snapshot!(response["hits"], @r###"[{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":1.0},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":1.0}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, full keyword => keyword let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "hybrid": {"semanticRatio": 0.0}, "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":[2.0,3.0]},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":[1.0,3.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Captain Marvel","desc":"a Shazam ersatz","id":"3","_vectors":{"default":{"embeddings":[[2.0,3.0]],"regenerate":false}},"_rankingScore":0.9848484848484848},{"title":"Shazam!","desc":"a Captain Marvel ersatz","id":"1","_vectors":{"default":{"embeddings":[[1.0,3.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"null"); // query + vector, no hybrid keyword => let (response, code) = index - .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true})) + .search_post(json!({"q": "Captain", "vector": [1.0, 0.0], "showRankingScore": true, "retrieveVectors": true})) .await; snapshot!(code, @"400 Bad Request"); @@ -453,7 +457,7 @@ async fn query_combination() { // full vector, without a vector => error let (response, code) = index .search_post( - json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true}), + json!({"q": "Captain", "hybrid": {"semanticRatio": 1.0}, "showRankingScore": true, "retrieveVectors": true}), ) .await; @@ -470,11 +474,93 @@ async fn query_combination() { // hybrid without a vector => full keyword let (response, code) = index .search_post( - json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true}), + json!({"q": "Planet", "hybrid": {"semanticRatio": 0.99}, "showRankingScore": true, "retrieveVectors": true}), ) .await; snapshot!(code, @"200 OK"); - snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic Universe","id":"2","_vectors":{"default":[1.0,2.0]},"_rankingScore":0.9242424242424242}]"###); + snapshot!(response["hits"], @r###"[{"title":"Captain Planet","desc":"He's not part of the Marvel Cinematic 
Universe","id":"2","_vectors":{"default":{"embeddings":[[1.0,2.0]],"regenerate":false}},"_rankingScore":0.9242424242424242}]"###); snapshot!(response["semanticHitCount"], @"0"); } + +#[actix_rt::test] +async fn retrieve_vectors() { + let server = Server::new().await; + let index = index_with_documents_hf(&server, &SIMPLE_SEARCH_DOCUMENTS).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1", + "_vectors": { + "default": { + "embeddings": "[vectors]", + "regenerate": true + } + } + } + ] + "###); + + // remove `_vectors` from displayed attributes + let (response, code) = + index.update_settings(json!({ "displayedAttributes": ["id", "title", "desc"]} )).await; + assert_eq!(202, code, "{:?}", response); + index.wait_task(response.uid()).await; + + let (response, code) = index + .search_post( + json!({"q": "Captain", "hybrid": {"semanticRatio": 0.2}, "retrieveVectors": true}), + ) + .await; + snapshot!(code, @"200 OK"); + insta::assert_json_snapshot!(response["hits"], {"[]._vectors.default.embeddings" => "[vectors]"}, @r###" + [ + { + "title": "Captain Planet", + "desc": "He's not part of the Marvel Cinematic Universe", + "id": "2" + }, + { + "title": "Captain Marvel", + "desc": "a Shazam ersatz", + "id": "3" + }, + { + "title": "Shazam!", + "desc": "a Captain Marvel ersatz", + "id": "1" + } + ] + "###); +} diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index e80c5144d..e239ff767 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -1290,21 +1290,38 @@ async fn experimental_feature_vector_store() { index.add_documents(json!(documents), None).await; index.wait_task(0).await; - let (response, code) = index - .search_post(json!({ + index + .search(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true - })) + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `vector` as a parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) + .await; + index + .search(json!({ + "retrieveVectors": true, + "showRankingScore": true + }), |response, code|{ + meili_snap::snapshot!(code, @"400 Bad Request"); + meili_snap::snapshot!(meili_snap::json_string!(response), @r###" + { + "message": "Passing `retrieveVectors` as a parameter requires enabling the `vector store` experimental feature. 
See https://github.com/meilisearch/product/discussions/677", + "code": "feature_not_enabled", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#feature_not_enabled" + } + "###); + }) .await; - meili_snap::snapshot!(code, @"400 Bad Request"); - meili_snap::snapshot!(meili_snap::json_string!(response), @r###" - { - "message": "Passing `vector` as a query parameter requires enabling the `vector store` experimental feature. See https://github.com/meilisearch/product/discussions/677", - "code": "feature_not_enabled", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#feature_not_enabled" - } - "###); let (response, code) = server.set_features(json!({"vectorStore": true})).await; meili_snap::snapshot!(code, @"200 OK"); @@ -1337,6 +1354,7 @@ async fn experimental_feature_vector_store() { .search_post(json!({ "vector": [1.0, 2.0, 3.0], "showRankingScore": true, + "retrieveVectors": true, })) .await; @@ -1348,11 +1366,16 @@ async fn experimental_feature_vector_store() { "title": "Shazam!", "id": "287947", "_vectors": { - "manual": [ - 1.0, - 2.0, - 3.0 - ] + "manual": { + "embeddings": [ + [ + 1.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 1.0 }, @@ -1360,11 +1383,16 @@ async fn experimental_feature_vector_store() { "title": "Captain Marvel", "id": "299537", "_vectors": { - "manual": [ - 1.0, - 2.0, - 54.0 - ] + "manual": { + "embeddings": [ + [ + 1.0, + 2.0, + 54.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.9129111766815186 }, @@ -1372,11 +1400,16 @@ async fn experimental_feature_vector_store() { "title": "Gläss", "id": "450465", "_vectors": { - "manual": [ - -100.0, - 340.0, - 90.0 - ] + "manual": { + "embeddings": [ + [ + -100.0, + 340.0, + 90.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.8106412887573242 }, @@ -1384,11 +1417,16 @@ async fn experimental_feature_vector_store() { "title": "How to Train Your Dragon: The Hidden World", "id": "166428", "_vectors": { - "manual": [ - -100.0, - 231.0, - 32.0 - ] + "manual": { + "embeddings": [ + [ + -100.0, + 231.0, + 32.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.7412010431289673 }, @@ -1396,11 +1434,16 @@ async fn experimental_feature_vector_store() { "title": "Escape Room", "id": "522681", "_vectors": { - "manual": [ - 10.0, - -23.0, - 32.0 - ] + "manual": { + "embeddings": [ + [ + 10.0, + -23.0, + 32.0 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.6972063183784485 } diff --git a/meilisearch/tests/similar/errors.rs b/meilisearch/tests/similar/errors.rs index 7765b9a85..546554882 100644 --- a/meilisearch/tests/similar/errors.rs +++ b/meilisearch/tests/similar/errors.rs @@ -756,3 +756,54 @@ async fn filter_reserved_geo_point_string() { }) .await; } + +#[actix_rt::test] +async fn similar_bad_retrieve_vectors() { + let server = Server::new().await; + server.set_features(json!({"vectorStore": true})).await; + let index = server.index("test"); + + let (response, code) = index.similar_post(json!({"retrieveVectors": "doggo"})).await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found a string: `\"doggo\"`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_post(json!({"retrieveVectors": [true]})).await; + snapshot!(code, @"400 Bad Request"); + 
snapshot!(json_string!(response), @r###" + { + "message": "Invalid value type at `.retrieveVectors`: expected a boolean, but found an array: `[true]`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); + + let (response, code) = index.similar_get("retrieveVectors=doggo").await; + snapshot!(code, @"400 Bad Request"); + snapshot!(json_string!(response), @r###" + { + "message": "Invalid value in parameter `retrieveVectors`: could not parse `doggo` as a boolean, expected either `true` or `false`", + "code": "invalid_similar_retrieve_vectors", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_similar_retrieve_vectors" + } + "###); +} diff --git a/meilisearch/tests/similar/mod.rs b/meilisearch/tests/similar/mod.rs index bde23b67f..60a0203ed 100644 --- a/meilisearch/tests/similar/mod.rs +++ b/meilisearch/tests/similar/mod.rs @@ -78,7 +78,7 @@ async fn basic() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143}), |response, code| { + .similar(json!({"id": 143, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -87,11 +87,16 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } }, { @@ -99,11 +104,16 @@ async fn basic() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } } }, { @@ -111,11 +121,16 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } } }, { @@ -123,11 +138,16 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } } } ] @@ -136,7 +156,7 @@ async fn basic() { .await; index - .similar(json!({"id": "299537"}), |response, code| { + .similar(json!({"id": "299537", "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -145,11 +165,16 @@ async fn basic() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } } }, { @@ -157,11 +182,16 @@ async fn basic() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, 
+ -0.5 + ] + ], + "regenerate": false + } } }, { @@ -169,11 +199,16 @@ async fn basic() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } }, { @@ -181,11 +216,16 @@ async fn basic() { "release_year": 1930, "id": "143", "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] + "manual": { + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "regenerate": false + } } } ] @@ -228,7 +268,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"4"); @@ -239,11 +279,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.890957772731781 }, @@ -252,11 +297,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.39060014486312866 }, @@ -265,11 +315,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.2819308042526245 }, @@ -278,11 +333,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "287947", "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.1662663221359253 } @@ -294,7 +354,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.2, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"3"); @@ -305,11 +365,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.890957772731781 }, @@ -318,11 +383,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.39060014486312866 }, @@ -331,11 +401,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "166428", "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 
+ ] + ], + "regenerate": false + } }, "_rankingScore": 0.2819308042526245 } @@ -347,7 +422,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.3, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"2"); @@ -358,11 +433,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.890957772731781 }, @@ -371,11 +451,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "299537", "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.39060014486312866 } @@ -387,7 +472,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.6, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); meili_snap::snapshot!(meili_snap::json_string!(response["estimatedTotalHits"]), @"1"); @@ -398,11 +483,16 @@ async fn ranking_score_threshold() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } }, "_rankingScore": 0.890957772731781 } @@ -414,7 +504,7 @@ async fn ranking_score_threshold() { index .similar( - json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9}), + json!({"id": 143, "showRankingScore": true, "rankingScoreThreshold": 0.9, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @"[]"); @@ -456,71 +546,97 @@ async fn filter() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 522681, "filter": "release_year = 2019"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - }, - { - "title": "How to Train Your Dragon: The Hidden World", - "release_year": 2019, - "id": "166428", - "_vectors": { - "manual": [ - 0.7, - 0.7, - -0.4 - ] - } - }, - { - "title": "Shazam!", - "release_year": 2019, - "id": "287947", - "_vectors": { - "manual": [ - 0.8, - 0.4, - -0.5 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year = 2019", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + } + }, + { + "title": "How to Train Your Dragon: The Hidden World", + "release_year": 2019, + "id": "166428", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.699999988079071, + 0.699999988079071, + -0.4000000059604645 + ] 
+ ], + "regenerate": false + } + } + }, + { + "title": "Shazam!", + "release_year": 2019, + "id": "287947", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.800000011920929, + 0.4000000059604645, + -0.5 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; index - .similar(json!({"id": 522681, "filter": "release_year < 2000"}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "All Quiet on the Western Front", - "release_year": 1930, - "id": "143", - "_vectors": { - "manual": [ - -0.5, - 0.3, - 0.85 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 522681, "filter": "release_year < 2000", "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "All Quiet on the Western Front", + "release_year": 1930, + "id": "143", + "_vectors": { + "manual": { + "embeddings": [ + [ + -0.5, + 0.30000001192092896, + 0.8500000238418579 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; } @@ -557,7 +673,7 @@ async fn limit_and_offset() { index.wait_task(value.uid()).await; index - .similar(json!({"id": 143, "limit": 1}), |response, code| { + .similar(json!({"id": 143, "limit": 1, "retrieveVectors": true}), |response, code| { snapshot!(code, @"200 OK"); snapshot!(json_string!(response["hits"]), @r###" [ @@ -566,11 +682,16 @@ async fn limit_and_offset() { "release_year": 2019, "id": "522681", "_vectors": { - "manual": [ - 0.1, - 0.6, - 0.8 - ] + "manual": { + "embeddings": [ + [ + 0.10000000149011612, + 0.6000000238418579, + 0.800000011920929 + ] + ], + "regenerate": false + } } } ] @@ -579,24 +700,32 @@ async fn limit_and_offset() { .await; index - .similar(json!({"id": 143, "limit": 1, "offset": 1}), |response, code| { - snapshot!(code, @"200 OK"); - snapshot!(json_string!(response["hits"]), @r###" - [ - { - "title": "Captain Marvel", - "release_year": 2019, - "id": "299537", - "_vectors": { - "manual": [ - 0.6, - 0.8, - -0.2 - ] - } - } - ] - "###); - }) + .similar( + json!({"id": 143, "limit": 1, "offset": 1, "retrieveVectors": true}), + |response, code| { + snapshot!(code, @"200 OK"); + snapshot!(json_string!(response["hits"]), @r###" + [ + { + "title": "Captain Marvel", + "release_year": 2019, + "id": "299537", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.6000000238418579, + 0.800000011920929, + -0.20000000298023224 + ] + ], + "regenerate": false + } + } + } + ] + "###); + }, + ) .await; } diff --git a/meilisearch/tests/vector/mod.rs b/meilisearch/tests/vector/mod.rs new file mode 100644 index 000000000..8d619a15a --- /dev/null +++ b/meilisearch/tests/vector/mod.rs @@ -0,0 +1,227 @@ +mod settings; + +use meili_snap::{json_string, snapshot}; + +use crate::common::index::Index; +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; + +#[actix_rt::test] +async fn add_remove_user_provided() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", 
"_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [10, 10, 10] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 10.0, + 10.0, + 10.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 2 + } + "###); + + let (value, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 1, + "name": "echo", + "_vectors": {} + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); +} + +async fn generate_default_user_provided_documents(server: &Server) -> Index { + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1] }}, + {"id": 2, "name": "billou", "_vectors": { "manual": [[2, 2, 2], [2, 2, 3]] }}, + {"id": 3, "name": "intel", "_vectors": { "manual": { "regenerate": false, "embeddings": [3, 3, 3] }}}, + {"id": 4, "name": "max", "_vectors": { "manual": { "regenerate": false, "embeddings": [[4, 4, 4], [4, 4, 5]] }}}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(value.uid()).await; + + index +} + +#[actix_rt::test] +async fn clear_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (value, _code) = index.clear_all_documents().await; + index.wait_task(value.uid()).await; + + // Make sure the documents DB has been cleared + let 
(documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [], + "offset": 0, + "limit": 20, + "total": 0 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "hits": [], + "query": "", + "processingTimeMs": 0, + "limit": 20, + "offset": 0, + "estimatedTotalHits": 0, + "semanticHitCount": 0 + } + "###); +} diff --git a/meilisearch/tests/vector/settings.rs b/meilisearch/tests/vector/settings.rs new file mode 100644 index 000000000..e53ceb383 --- /dev/null +++ b/meilisearch/tests/vector/settings.rs @@ -0,0 +1,228 @@ +use meili_snap::{json_string, snapshot}; + +use crate::common::{GetAllDocumentsOptions, Server}; +use crate::json; +use crate::vector::generate_default_user_provided_documents; + +#[actix_rt::test] +async fn update_embedder() { + let server = Server::new().await; + let index = server.index("doggo"); + let (value, code) = server.set_features(json!({"vectorStore": true})).await; + snapshot!(code, @"200 OK"); + snapshot!(value, @r###" + { + "vectorStore": true, + "metrics": false, + "logsRoute": false + } + "###); + + let (response, code) = index + .update_settings(json!({ + "embedders": { "manual": {}}, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + + let ret = server.wait_task(response.uid()).await; + snapshot!(ret, @r###" + { + "uid": 1, + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 2 + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +} + +#[actix_rt::test] +async fn reset_embedder_documents() { + let server = Server::new().await; + let index = generate_default_user_provided_documents(&server).await; + + let (response, code) = index.delete_settings().await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await; + + // Make sure the documents are still present + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { + limit: None, + offset: None, + retrieve_vectors: false, + fields: None, + }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir" + }, + { + "id": 1, + "name": "echo" + }, + { + "id": 2, + "name": "billou" + }, + { + "id": 3, + "name": "intel" + }, + { + "id": 4, + "name": "max" + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure we are still able to retrieve their vectors + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 1, + "name": "echo", + "_vectors": { + "manual": { + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 2, + "name": 
"billou", + "_vectors": { + "manual": { + "embeddings": [ + [ + 2.0, + 2.0, + 2.0 + ], + [ + 2.0, + 2.0, + 3.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 3, + "name": "intel", + "_vectors": { + "manual": { + "embeddings": [ + [ + 3.0, + 3.0, + 3.0 + ] + ], + "regenerate": false + } + } + }, + { + "id": 4, + "name": "max", + "_vectors": { + "manual": { + "embeddings": [ + [ + 4.0, + 4.0, + 4.0 + ], + [ + 4.0, + 4.0, + 5.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 5 + } + "###); + + // Make sure the arroy DB has been cleared + let (documents, _code) = index.search_post(json!({ "vector": [1, 1, 1] })).await; + snapshot!(json_string!(documents), @r###" + { + "message": "Cannot find embedder with name `default`.", + "code": "invalid_embedder", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_embedder" + } + "###); +} diff --git a/milli/Cargo.toml b/milli/Cargo.toml index f23694d10..a4aa4ef95 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -44,7 +44,7 @@ once_cell = "1.19.0" ordered-float = "4.2.0" rand_pcg = { version = "0.3.1", features = ["serde1"] } rayon = "1.8.0" -roaring = "0.10.2" +roaring = { version = "0.10.2", features = ["serde"] } rstar = { version = "0.11.0", features = ["serde"] } serde = { version = "1.0.195", features = ["derive"] } serde_json = { version = "1.0.111", features = ["preserve_order"] } @@ -71,10 +71,10 @@ csv = "1.3.0" candle-core = { version = "0.4.1" } candle-transformers = { version = "0.4.1" } candle-nn = { version = "0.4.1" } -tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default_features = false, features = [ +tokenizers = { git = "https://github.com/huggingface/tokenizers.git", tag = "v0.15.2", version = "0.15.2", default-features = false, features = [ "onig", ] } -hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default_features = false, features = [ +hf-hub = { git = "https://github.com/dureuill/hf-hub.git", branch = "rust_tls", default-features = false, features = [ "online", ] } tiktoken-rs = "0.5.8" diff --git a/milli/src/fieldids_weights_map.rs b/milli/src/fieldids_weights_map.rs index a737632a4..13f2f8afc 100644 --- a/milli/src/fieldids_weights_map.rs +++ b/milli/src/fieldids_weights_map.rs @@ -4,6 +4,7 @@ use std::collections::HashMap; use serde::{Deserialize, Serialize}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::{FieldId, FieldsIdsMap, Weight}; #[derive(Debug, Default, Serialize, Deserialize)] @@ -23,7 +24,13 @@ impl FieldidsWeightsMap { /// Should only be called in the case there are NO searchable attributes. /// All the fields will be inserted in the order of the fields ids map with a weight of 0. pub fn from_field_id_map_without_searchable(fid_map: &FieldsIdsMap) -> Self { - FieldidsWeightsMap { map: fid_map.ids().map(|fid| (fid, 0)).collect() } + FieldidsWeightsMap { + map: fid_map + .iter() + .filter(|(_fid, name)| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) + .map(|(fid, _name)| (fid, 0)) + .collect(), + } } /// Removes a field id from the map, returning the associated weight previously in the map. diff --git a/milli/src/fields_ids_map.rs b/milli/src/fields_ids_map.rs index 9c1c87f82..f9d7c3704 100644 --- a/milli/src/fields_ids_map.rs +++ b/milli/src/fields_ids_map.rs @@ -41,6 +41,16 @@ impl FieldsIdsMap { } } + /// Get the ids of a field and all its nested fields based on its name. 
+ pub fn nested_ids(&self, name: &str) -> Vec<FieldId> { + self.names_ids + .range(name.to_string()..) + .take_while(|(key, _)| key.starts_with(name)) + .filter(|(key, _)| crate::is_faceted_by(key, name)) + .map(|(_name, id)| *id) + .collect() + } + /// Get the id of a field based on its name. pub fn id(&self, name: &str) -> Option<FieldId> { self.names_ids.get(name).copied() @@ -126,4 +136,32 @@ mod tests { assert_eq!(iter.next(), Some((3, "title"))); assert_eq!(iter.next(), None); } + + #[test] + fn nested_fields() { + let mut map = FieldsIdsMap::new(); + + assert_eq!(map.insert("id"), Some(0)); + assert_eq!(map.insert("doggo"), Some(1)); + assert_eq!(map.insert("doggo.name"), Some(2)); + assert_eq!(map.insert("doggolution"), Some(3)); + assert_eq!(map.insert("doggo.breed.name"), Some(4)); + assert_eq!(map.insert("description"), Some(5)); + + insta::assert_debug_snapshot!(map.nested_ids("doggo"), @r###" + [ + 1, + 4, + 2, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("doggo.breed"), @r###" + [ + 4, + ] + "###); + + insta::assert_debug_snapshot!(map.nested_ids("_vector"), @"[]"); + } } diff --git a/milli/src/index.rs b/milli/src/index.rs index 3c502d541..d325d6fa4 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -9,6 +9,7 @@ use heed::types::*; use heed::{CompactionOption, Database, RoTxn, RwTxn, Unspecified}; use roaring::RoaringBitmap; use rstar::RTree; +use serde::{Deserialize, Serialize}; use time::OffsetDateTime; use crate::documents::PrimaryKey; @@ -23,6 +24,7 @@ use crate::heed_codec::{ }; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::{Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, @@ -644,6 +646,7 @@ impl Index { &self, wtxn: &mut RwTxn, user_fields: &[&str], + non_searchable_fields_ids: &[FieldId], fields_ids_map: &FieldsIdsMap, ) -> Result<()> { // We can write the user defined searchable fields as-is. @@ -662,6 +665,7 @@ impl Index { for (weight, user_field) in user_fields.iter().enumerate() { if crate::is_faceted_by(field_from_map, user_field) && !real_fields.contains(&field_from_map) + && !non_searchable_fields_ids.contains(&id) { real_fields.push(field_from_map); @@ -708,6 +712,7 @@ impl Index { Ok(self .fields_ids_map(rtxn)? .names() + .filter(|name| !crate::is_faceted_by(name, RESERVED_VECTORS_FIELD_NAME)) .map(|field| Cow::Owned(field.to_string())) .collect()) }) @@ -1568,12 +1573,16 @@ impl Index { Ok(script_language) } + /// Put the embedding configs: + /// 1. The name of the embedder + /// 2. The configuration option for this embedder + /// 3. The list of documents with a user-provided embedding pub(crate) fn put_embedding_configs( &self, wtxn: &mut RwTxn<'_>, - configs: Vec<(String, EmbeddingConfig)>, + configs: Vec<IndexEmbeddingConfig>, ) -> heed::Result<()> { - self.main.remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>().put( + self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put( wtxn, main_key::EMBEDDING_CONFIGS, &configs, ) @@ -1584,13 +1593,10 @@ self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS) } - pub fn embedding_configs( - &self, - rtxn: &RoTxn<'_>, - ) -> Result<Vec<(String, EmbeddingConfig)>> { + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> { Ok(self .main - .remap_types::<Str, SerdeJson<Vec<(String, EmbeddingConfig)>>>() + .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>() .get(rtxn, main_key::EMBEDDING_CONFIGS)?
.unwrap_or_default()) } @@ -1662,6 +1668,13 @@ impl Index { } } +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + pub user_provided: RoaringBitmap, +} + #[cfg(test)] pub(crate) mod tests { use std::collections::HashSet; @@ -1669,15 +1682,17 @@ pub(crate) mod tests { use big_s::S; use heed::{EnvOpenOptions, RwTxn}; - use maplit::hashset; + use maplit::{btreemap, hashset}; use tempfile::TempDir; use crate::documents::DocumentsBatchReader; use crate::error::{Error, InternalError}; use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; use crate::update::{ - self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, + self, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, + Settings, }; + use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; use crate::{db_snap, obkv_to_json, Filter, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -2783,4 +2798,95 @@ pub(crate) mod tests { ] "###); } + + #[test] + fn vectors_are_never_indexed_as_searchable_or_filterable() { + let index = TempIndex::new(); + + index + .add_documents(documents!([ + { "id": 0, "_vectors": { "doggo": [2345] } }, + { "id": 1, "_vectors": { "doggo": [6789] } }, + ])) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @r###"["id"]"###); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + 0 0 | + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + drop(rtxn); + + index + .update_settings(|settings| { + settings.set_searchable_fields(vec![S("_vectors"), S("_vectors.doggo")]); + settings.set_filterable_fields(hashset![S("_vectors"), S("_vectors.doggo")]); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + + index + .update_settings(|settings| { + settings.set_embedder_settings(btreemap! 
{ + S("doggo") => Setting::Set(EmbeddingSettings { + dimensions: Setting::Set(1), + source: Setting::Set(EmbedderSource::UserProvided), + ..EmbeddingSettings::default()}), + }); + }) + .unwrap(); + + db_snap!(index, fields_ids_map, @r###" + 0 id | + 1 _vectors | + 2 _vectors.doggo | + "###); + db_snap!(index, searchable_fields, @"[]"); + db_snap!(index, fieldids_weights_map, @r###" + fid weight + "###); + + let rtxn = index.read_txn().unwrap(); + let mut search = index.search(&rtxn); + let results = search.query("2345").execute().unwrap(); + assert!(results.candidates.is_empty()); + + let mut search = index.search(&rtxn); + let results = search + .filter(Filter::from_str("_vectors.doggo = 6789").unwrap().unwrap()) + .execute() + .unwrap(); + assert!(results.candidates.is_empty()); + } } diff --git a/milli/src/search/new/logger/visual.rs b/milli/src/search/new/logger/visual.rs index 8df56da89..2bffdd8d9 100644 --- a/milli/src/search/new/logger/visual.rs +++ b/milli/src/search/new/logger/visual.rs @@ -22,7 +22,7 @@ pub enum SearchEvents { RankingRuleStartIteration { ranking_rule_idx: usize, universe_len: u64 }, RankingRuleNextBucket { ranking_rule_idx: usize, universe_len: u64, bucket_len: u64 }, RankingRuleSkipBucket { ranking_rule_idx: usize, bucket_len: u64 }, - RankingRuleEndIteration { ranking_rule_idx: usize, universe_len: u64 }, + RankingRuleEndIteration { ranking_rule_idx: usize }, ExtendResults { new: Vec }, ProximityGraph { graph: RankingRuleGraph }, ProximityPaths { paths: Vec>> }, @@ -123,12 +123,9 @@ impl SearchLogger for VisualSearchLogger { &mut self, ranking_rule_idx: usize, _ranking_rule: &dyn RankingRule, - universe: &RoaringBitmap, + _universe: &RoaringBitmap, ) { - self.events.push(SearchEvents::RankingRuleEndIteration { - ranking_rule_idx, - universe_len: universe.len(), - }); + self.events.push(SearchEvents::RankingRuleEndIteration { ranking_rule_idx }); self.location.pop(); } fn add_to_results(&mut self, docids: &[u32]) { @@ -326,7 +323,7 @@ impl<'ctx> DetailedLoggerFinish<'ctx> { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_skip_bucket(bucket_len)?; } - SearchEvents::RankingRuleEndIteration { ranking_rule_idx, universe_len: _ } => { + SearchEvents::RankingRuleEndIteration { ranking_rule_idx } => { assert!(ranking_rule_idx == self.rr_action_counter.len() - 1); self.write_end_iteration()?; } diff --git a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap b/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap deleted file mode 100644 index 930a21626..000000000 --- a/milli/src/search/new/tests/snapshots/milli__search__new__tests__attribute_fid__attribute_fid_ngrams.snap +++ /dev/null @@ -1,244 +0,0 @@ ---- -source: milli/src/search/new/tests/attribute_fid.rs -expression: "format!(\"{document_ids_scores:#?}\")" ---- -[ - ( - 2, - [ - Fid( - Rank { - rank: 19, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), - ( - 6, - [ - Fid( - Rank { - rank: 15, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 5, - [ - Fid( - Rank { - rank: 14, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 4, - [ - Fid( - Rank { - rank: 13, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 3, - [ - Fid( - Rank { - rank: 12, - max_rank: 19, - }, - ), - Position( - Rank { - 
rank: 83, - max_rank: 91, - }, - ), - ], - ), - ( - 9, - [ - Fid( - Rank { - rank: 11, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 8, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 79, - max_rank: 91, - }, - ), - ], - ), - ( - 7, - [ - Fid( - Rank { - rank: 10, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 73, - max_rank: 91, - }, - ), - ], - ), - ( - 11, - [ - Fid( - Rank { - rank: 7, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 77, - max_rank: 91, - }, - ), - ], - ), - ( - 10, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 13, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 81, - max_rank: 91, - }, - ), - ], - ), - ( - 12, - [ - Fid( - Rank { - rank: 6, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 78, - max_rank: 91, - }, - ), - ], - ), - ( - 14, - [ - Fid( - Rank { - rank: 5, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 75, - max_rank: 91, - }, - ), - ], - ), - ( - 0, - [ - Fid( - Rank { - rank: 1, - max_rank: 19, - }, - ), - Position( - Rank { - rank: 91, - max_rank: 91, - }, - ), - ], - ), -] diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/1/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap b/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap deleted file mode 100644 index 1d1d629e6..000000000 --- a/milli/src/snapshots/index.rs/initial_field_distribution/field_distribution.snap +++ /dev/null @@ -1,7 +0,0 @@ ---- -source: milli/src/index.rs ---- -age 1 | -id 2 | -name 2 | - diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 3490b55e4..9eca378a5 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -64,6 +64,13 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_rtree(self.wtxn)?; self.index.delete_geo_faceted_documents_ids(self.wtxn)?; + // Remove all user-provided bits from the configs + let mut configs = self.index.embedding_configs(self.wtxn)?; + for config in configs.iter_mut() { + config.user_provided.clear(); + } + self.index.put_embedding_configs(self.wtxn, configs)?; + // Clear the other databases. 
external_documents_ids.clear(self.wtxn)?; word_docids.clear(self.wtxn)?; diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index 76ec90d65..736c21c9f 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -8,18 +8,19 @@ use std::sync::Arc; use bytemuck::cast_slice; use grenad::Writer; -use itertools::EitherOrBoth; use ordered_float::OrderedFloat; +use roaring::RoaringBitmap; use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; +use crate::index::IndexEmbeddingConfig; use crate::prompt::Prompt; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; -use crate::update::index_documents::helpers::try_split_at; use crate::update::settings::InnerIndexSettingsDiff; -use crate::vector::parsed_vectors::{ParsedVectorsDiff, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; +use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::Embedder; -use crate::{DocumentId, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::<DocumentId>(); @@ -35,6 +36,8 @@ pub struct ExtractedVectorPoints { // embedder pub embedder_name: String, pub embedder: Arc<Embedder>, + pub add_to_user_provided: RoaringBitmap, + pub remove_from_user_provided: RoaringBitmap, } enum VectorStateDelta { @@ -42,12 +45,7 @@ // Remove all vectors, generated or manual, from this document NowRemoved, - // Add the manually specified vectors, passed in the other grenad - // Remove any previously generated vectors - // Note: changing the value of the manually specified vector **should not record** this delta - WasGeneratedNowManual(Vec<Vec<f32>>), - - ManualDelta(Vec<Vec<f32>>, Vec<Vec<f32>>), + NowManual(Vec<Vec<f32>>), // Add the vector computed from the specified prompt // Remove any previous vector @@ -56,14 +54,12 @@ } impl VectorStateDelta { - fn into_values(self) -> (bool, String, (Vec<Vec<f32>>, Vec<Vec<f32>>)) { + fn into_values(self) -> (bool, String, Vec<Vec<f32>>) { match self { VectorStateDelta::NoChange => Default::default(), VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - VectorStateDelta::WasGeneratedNowManual(add) => { - (true, Default::default(), (Default::default(), add)) - } - VectorStateDelta::ManualDelta(del, add) => (false, Default::default(), (del, add)), + // We always delete the previous vectors + VectorStateDelta::NowManual(add) => (true, Default::default(), add), VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), } } @@ -74,12 +70,27 @@ struct EmbedderVectorExtractor { embedder: Arc<Embedder>, prompt: Arc<Prompt>, - // (docid, _index) -> KvWriterDelAdd -> Vector - manual_vectors_writer: Writer<BufWriter<File>>, // (docid) -> (prompt) prompts_writer: Writer<BufWriter<File>>, // (docid) -> () remove_vectors_writer: Writer<BufWriter<File>>, + // (docid, _index) -> KvWriterDelAdd -> Vector + manual_vectors_writer: Writer<BufWriter<File>>, + // The docids of the documents that contain a user-defined embedding + add_to_user_provided: RoaringBitmap, + + action: ExtractionAction, +} + +struct DocumentOperation { + // The docids of the documents that contain an auto-generated embedding + remove_from_user_provided: RoaringBitmap, +}
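+
+// Illustrative sketch (an assumed merge step, not shown in this patch) of how the
+// two bitmaps above are meant to be folded into the stored `IndexEmbeddingConfig`
+// once extraction completes; `config` is a hypothetical binding to the matching
+// embedder's config:
+//
+//     config.user_provided |= add_to_user_provided;
+//     config.user_provided -= remove_from_user_provided;
+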
+enum ExtractionAction { + SettingsFullReindex, + SettingsRegeneratePrompts { old_prompt: Arc<Prompt> }, + DocumentOperation(DocumentOperation), } /// Extracts the embedding vector contained in each document under the `_vectors` field. @@ -89,6 +100,7 @@ struct EmbedderVectorExtractor { pub fn extract_vector_points<R: io::Read + io::Seek>( obkv_documents: grenad::Reader<R>, indexer: GrenadParameters, + embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, ) -> Result<Vec<ExtractedVectorPoints>> { let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; let new_fields_ids_map = &settings_diff.new.fields_ids_map; // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); - // filter the old vector fid if the settings has been changed forcing reindexing. - let old_vectors_fid = old_vectors_fid.filter(|_| !reindex_vectors); let new_vectors_fid = new_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); let mut extractors = Vec::new(); - for (embedder_name, (embedder, prompt)) in - settings_diff.new.embedding_configs.clone().into_iter() - { - // (docid, _index) -> KvWriterDelAdd -> Vector - let manual_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - // (docid) -> (prompt) - let prompts_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); + let old_configs = &settings_diff.old.embedding_configs; - // (docid) -> () - let remove_vectors_writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); + if reindex_vectors { + for (name, action) in settings_diff.embedding_config_updates.iter() { + match action { + EmbedderAction::WriteBackToDocuments(_) => continue, // already deleted + EmbedderAction::Reindex(action) => { + let Some((embedder_name, (embedder, prompt))) = configs.remove_entry(name) + else { + tracing::error!(embedder = name, "Requested embedder config not found"); + continue; + }; - extractors.push(EmbedderVectorExtractor { - embedder_name, - embedder, - prompt, - manual_vectors_writer, - prompts_writer, - remove_vectors_writer, - }); + // (docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + let action = match action { + ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, + ReindexAction::RegeneratePrompts => { + let Some((_, old_prompt)) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { old_prompt } + } + }; + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action, + }); + } + } + } + } else { + // document operation + + for (embedder_name, (embedder, prompt)) in configs.into_iter() { + //
(docid, _index) -> KvWriterDelAdd -> Vector + let manual_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> (prompt) + let prompts_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + // (docid) -> () + let remove_vectors_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + extractors.push(EmbedderVectorExtractor { + embedder_name, + embedder, + prompt, + prompts_writer, + remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided: RoaringBitmap::new(), + action: ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided: RoaringBitmap::new(), + }), + }); + } } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; while let Some((key, value)) = cursor.move_on_next()? { // this must always be serialized as (docid, external_docid); + const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = - try_split_at(key, std::mem::size_of::()).unwrap(); + try_split_array_at::(key).unwrap(); debug_assert!(from_utf8(external_id_bytes).is_ok()); + let docid = DocumentId::from_be_bytes(docid_bytes); let obkv = obkv::KvReader::new(value); key_buffer.clear(); - key_buffer.extend_from_slice(docid_bytes); + key_buffer.extend_from_slice(docid_bytes.as_slice()); // since we only need the primary key when we throw an error we create this getter to // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; - let mut parsed_vectors = ParsedVectorsDiff::new(obkv, old_vectors_fid, new_vectors_fid) - .map_err(|error| error.to_crate_error(document_id().to_string()))?; + let mut parsed_vectors = ParsedVectorsDiff::new( + docid, + embedders_configs, + obkv, + old_vectors_fid, + new_vectors_fid, + ) + .map_err(|error| error.to_crate_error(document_id().to_string()))?; for EmbedderVectorExtractor { embedder_name, embedder: _, prompt, - manual_vectors_writer, prompts_writer, remove_vectors_writer, + manual_vectors_writer, + add_to_user_provided, + action, } in extractors.iter_mut() { - let delta = match parsed_vectors.remove(embedder_name) { - (Some(old), Some(new)) => { - // no autogeneration - let del_vectors = old.into_array_of_vectors(); - let add_vectors = new.into_array_of_vectors(); - - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::ManualDelta(del_vectors, add_vectors) - } - (Some(_old), None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - if document_is_kept { - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) 
- } else { - VectorStateDelta::NowRemoved - } - } - (None, Some(new)) => { - // was possibly autogenerated, remove all vectors for that document - let add_vectors = new.into_array_of_vectors(); - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError(crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ))); - } - - VectorStateDelta::WasGeneratedNowManual(add_vectors) - } - (None, None) => { - // Do we keep this document? - let document_is_kept = obkv - .iter() - .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) - .any(|deladd| deladd.get(DelAdd::Addition).is_some()); - - if document_is_kept { - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt) - // TODO: this filter works because we erase the vec database when a embedding setting changes. - // When vector pipeline will be optimized, this should be removed. - .filter(|_| !settings_diff.reindex_vectors()) - .map(|p| { - p.render(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" - ); - VectorStateDelta::NowGenerated(new_prompt) - } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange + let (old, new) = parsed_vectors.remove(embedder_name); + let delta = match action { + ExtractionAction::SettingsFullReindex => match old { + // A full reindex can be triggered either by: + // 1. a new embedder + // 2. an existing embedder changed so that it must regenerate all generated embeddings. + // For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB + VectorState::Inline(vectors) => { + if !vectors.must_regenerate() { + add_to_user_provided.insert(docid); } + + match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); + } + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, + } + } + // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors + VectorState::Manual => VectorStateDelta::NoChange, + // generated vectors must be regenerated + VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, + }, + // prompt regeneration is only triggered for existing embedders + ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + if old.must_regenerate() { + regenerate_if_prompt_changed( + obkv, + (old_prompt, prompt), + (&old_fields_ids_map, &new_fields_ids_map), + )? 
} else { - VectorStateDelta::NowRemoved + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder + VectorStateDelta::NoChange } } + ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) => extract_vector_document_diff( + docid, + obkv, + prompt, + (add_to_user_provided, remove_from_user_provided), + (old, new), + (&old_fields_ids_map, &new_fields_ids_map), + document_id, + )?, }; - // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, @@ -251,7 +317,6 @@ pub fn extract_vector_points( manual_vectors_writer, &mut key_buffer, delta, - reindex_vectors, )?; } } @@ -262,43 +327,185 @@ pub fn extract_vector_points( embedder_name, embedder, prompt: _, - manual_vectors_writer, prompts_writer, remove_vectors_writer, + action, + manual_vectors_writer, + add_to_user_provided, } in extractors { - results.push(ExtractedVectorPoints { - // docid, _index -> KvWriterDelAdd -> Vector - manual_vectors: writer_into_reader(manual_vectors_writer)?, - // docid -> () - remove_vectors: writer_into_reader(remove_vectors_writer)?, - // docid -> prompt - prompts: writer_into_reader(prompts_writer)?, + let remove_from_user_provided = + if let ExtractionAction::DocumentOperation(DocumentOperation { + remove_from_user_provided, + }) = action + { + remove_from_user_provided + } else { + Default::default() + }; + results.push(ExtractedVectorPoints { + manual_vectors: writer_into_reader(manual_vectors_writer)?, + remove_vectors: writer_into_reader(remove_vectors_writer)?, + prompts: writer_into_reader(prompts_writer)?, embedder, embedder_name, + add_to_user_provided, + remove_from_user_provided, }) } Ok(results) } -/// Computes the diff between both Del and Add numbers and -/// only inserts the parts that differ in the sorter. +fn extract_vector_document_diff( + docid: DocumentId, + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + (old, new): (VectorState, VectorState), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + document_id: impl Fn() -> Value, +) -> Result { + match (old.must_regenerate(), new.must_regenerate()) { + (true, true) | (false, false) => {} + (true, false) => { + add_to_user_provided.insert(docid); + } + (false, true) => { + remove_from_user_provided.insert(docid); + } + } + + let delta = match (old, new) { + // regardless of the previous state, if a document now contains inline _vectors, they must + // be extracted manually + (_old, VectorState::Inline(new)) => match new.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError(crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ))); + } + + VectorStateDelta::NowManual(add_vectors) + } + None => VectorStateDelta::NoChange, + }, + // no `_vectors` anywhere, we check for document removal and otherwise we regenerate the prompt if the + // document changed + (VectorState::Generated, VectorState::Generated) => { + // Do we keep this document? 
+ let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + + if document_is_kept { + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() + }); + let new_prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } + } else { + VectorStateDelta::NowRemoved + } + } + // inline to the left is not supposed to be possible because the embedder is not new, so `_vectors` was removed from + // the previous version of the document. + // Manual -> Generated is also not possible without an Inline to the right (which is handled above) + // Generated -> Generated is handled above, so not possible + // As a result, this code is unreachable + (_not_generated, VectorState::Generated) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // becomes autogenerated + VectorStateDelta::NowGenerated(prompt.render( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + // inline to the left is not possible because the embedder is not new, and so `_vectors` was removed from the previous + // version of the document. + // however the Rust type system cannot know that. + (_manual, VectorState::Manual) => { + // Do we keep this document? + let document_is_kept = obkv + .iter() + .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) + .any(|deladd| deladd.get(DelAdd::Addition).is_some()); + if document_is_kept { + // if the new version of documents has the vectors in the DB, + // then they are user-provided and nothing possibly changed + VectorStateDelta::NoChange + } else { + // make sure the document is always removed from user provided on removal + remove_from_user_provided.insert(docid); + VectorStateDelta::NowRemoved + } + } + }; + + Ok(delta) +} + +fn regenerate_if_prompt_changed( + obkv: obkv::KvReader<'_, FieldId>, + (old_prompt, new_prompt): (&Prompt, &Prompt), + (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), +) -> Result { + let old_prompt = + old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); + let new_prompt = new_prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + if new_prompt == old_prompt { + return Ok(VectorStateDelta::NoChange); + } + Ok(VectorStateDelta::NowGenerated(new_prompt)) +} + +fn regenerate_prompt( + obkv: obkv::KvReader<'_, FieldId>, + prompt: &Prompt, + new_fields_ids_map: &FieldsIdsMap, +) -> Result { + let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; + + Ok(VectorStateDelta::NowGenerated(prompt)) +} + +/// We cannot compute the diff between both Del and Add vectors. +/// We'll push every vector and compute the difference later in TypedChunk. 
 fn push_vectors_diff(
     remove_vectors_writer: &mut Writer<BufWriter<File>>,
     prompts_writer: &mut Writer<BufWriter<File>>,
     manual_vectors_writer: &mut Writer<BufWriter<File>>,
     key_buffer: &mut Vec<u8>,
     delta: VectorStateDelta,
-    reindex_vectors: bool,
 ) -> Result<()> {
-    let (must_remove, prompt, (mut del_vectors, mut add_vectors)) = delta.into_values();
-    if must_remove
-    // TODO: the below condition works because we erase the vec database when a embedding setting changes.
-    // When vector pipeline will be optimized, this should be removed.
-        && !reindex_vectors
-    {
+    let (must_remove, prompt, mut add_vectors) = delta.into_values();
+    if must_remove {
         key_buffer.truncate(TRUNCATE_SIZE);
         remove_vectors_writer.insert(&key_buffer, [])?;
     }
@@ -308,44 +515,22 @@ fn push_vectors_diff(
     // We sort and dedup the vectors
-    del_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
     add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b));
-    del_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
     add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq());
 
-    let merged_vectors_iter =
-        itertools::merge_join_by(del_vectors, add_vectors, |del, add| compare_vectors(del, add));
-
     // insert vectors into the writer
-    for (i, eob) in merged_vectors_iter.into_iter().enumerate().take(u16::MAX as usize) {
+    for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) {
         // Generate the key by extending the unique index to it.
         key_buffer.truncate(TRUNCATE_SIZE);
         let index = u16::try_from(i).unwrap();
         key_buffer.extend_from_slice(&index.to_be_bytes());
 
-        match eob {
-            EitherOrBoth::Both(_, _) => (), // no need to touch anything
-            EitherOrBoth::Left(vector) => {
-                // TODO: the below condition works because we erase the vec database when a embedding setting changes.
-                // When vector pipeline will be optimized, this should be removed.
-                if !reindex_vectors {
-                    // We insert only the Del part of the Obkv to inform
-                    // that we only want to remove all those vectors.
-                    let mut obkv = KvWriterDelAdd::memory();
-                    obkv.insert(DelAdd::Deletion, cast_slice(&vector))?;
-                    let bytes = obkv.into_inner()?;
-                    manual_vectors_writer.insert(&key_buffer, bytes)?;
-                }
-            }
-            EitherOrBoth::Right(vector) => {
-                // We insert only the Add part of the Obkv to inform
-                // that we only want to remove all those vectors.
-                let mut obkv = KvWriterDelAdd::memory();
-                obkv.insert(DelAdd::Addition, cast_slice(&vector))?;
-                let bytes = obkv.into_inner()?;
-                manual_vectors_writer.insert(&key_buffer, bytes)?;
-            }
-        }
+        // We insert only the Add part of the Obkv to inform
+        // that we only want to remove all those vectors.
+ let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; } Ok(()) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 18340a3ae..9da3983fc 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -30,6 +30,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; +use crate::index::IndexEmbeddingConfig; use crate::update::settings::InnerIndexSettingsDiff; use crate::{FieldId, Result, ThreadPoolNoAbortBuilder}; @@ -43,6 +44,7 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender>, primary_key_id: FieldId, + embedders_configs: Arc>, settings_diff: Arc, max_positions_per_attributes: Option, ) -> Result<()> { @@ -55,6 +57,7 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), + embedders_configs.clone(), settings_diff.clone(), ) }) @@ -210,6 +213,7 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, + embedders_configs: Arc>, settings_diff: Arc, ) -> Result<()> { let original_documents_chunk = @@ -226,11 +230,17 @@ fn send_original_documents_data( if index_vectors { let settings_diff = settings_diff.clone(); + let embedders_configs = embedders_configs.clone(); let original_documents_chunk = original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); rayon::spawn(move || { - match extract_vector_points(original_documents_chunk.clone(), indexer, &settings_diff) { + match extract_vector_points( + original_documents_chunk.clone(), + indexer, + &embedders_configs, + &settings_diff, + ) { Ok(extracted_vectors) => { for ExtractedVectorPoints { manual_vectors, @@ -238,6 +248,8 @@ fn send_original_documents_data( prompts, embedder_name, embedder, + add_to_user_provided, + remove_from_user_provided, } in extracted_vectors { let embeddings = match extract_embeddings( @@ -262,6 +274,8 @@ fn send_original_documents_data( expected_dimension: embedder.dimensions(), manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, })); } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2420463b4..3586c9c6d 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -286,6 +286,7 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); + let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); let backup_pool; let pool = match self.indexer_config.thread_pool { @@ -399,6 +400,7 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, + embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, ) @@ -501,6 +503,8 @@ where embeddings, manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { @@ -509,6 +513,8 @@ where expected_dimension, manual_vectors, embedder_name, + add_to_user_provided, + remove_from_user_provided, } } otherwise => 
otherwise, @@ -781,6 +787,7 @@ mod tests { use super::*; use crate::documents::documents_batch_reader_from_objects; use crate::index::tests::TempIndex; + use crate::index::IndexEmbeddingConfig; use crate::search::TermsMatchingStrategy; use crate::update::Setting; use crate::{db_snap, Filter, Search}; @@ -2616,10 +2623,12 @@ mod tests { let rtxn = index.read_txn().unwrap(); let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let (embedder_name, embedder) = embedding_configs.pop().unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = + embedding_configs.pop().unwrap(); + insta::assert_snapshot!(embedder_name, @"manual"); + insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); let embedder = std::sync::Arc::new(crate::vector::Embedder::new(embedder.embedder_options).unwrap()); - assert_eq!("manual", embedder_name); let res = index .search(&rtxn) .semantic(embedder_name, embedder, Some([0.0, 1.0, 2.0].to_vec())) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c34b7876a..1dff29a90 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::btree_map::Entry as BEntry; use std::collections::hash_map::Entry as HEntry; -use std::collections::{HashMap, HashSet}; +use std::collections::{BTreeMap, HashMap, HashSet}; use std::fs::File; use std::io::{Read, Seek}; @@ -27,6 +27,8 @@ use crate::update::del_add::{ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; +use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; +use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; use crate::{ is_faceted_by, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, Result, }; @@ -51,7 +53,6 @@ pub struct Transform<'a, 'i> { fields_ids_map: FieldsIdsMap, indexer_settings: &'a IndexerConfig, - pub autogenerate_docids: bool, pub index_documents_method: IndexDocumentsMethod, available_documents_ids: AvailableDocumentsIds, @@ -105,7 +106,7 @@ impl<'a, 'i> Transform<'a, 'i> { index: &'i Index, indexer_settings: &'a IndexerConfig, index_documents_method: IndexDocumentsMethod, - autogenerate_docids: bool, + _autogenerate_docids: bool, ) -> Result { // We must choose the appropriate merge function for when two or more documents // with the same user id must be merged or fully replaced in the same batch. 
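The `transform.rs` hunks below implement the write-back path: when an embedder is removed, its user-provided embeddings are read back from arroy and re-injected into each affected document under `_vectors.<embedder>` before the arroy trees are cleared. A hedged sketch of the JSON entry being rebuilt; the field names follow the new document format from this diff, while the helper itself is illustrative:

use serde_json::{json, Map, Value};

// Builds the `_vectors.<embedder>` entry for one document: the vectors read
// back from arroy become inline embeddings, and `regenerate: false` marks
// them as user-provided so they survive future indexing runs.
fn write_back_entry(vectors: &mut Map<String, Value>, embedder: &str, embeddings: Vec<Vec<f32>>) {
    vectors.insert(
        embedder.to_string(),
        json!({ "embeddings": embeddings, "regenerate": false }),
    );
}
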
@@ -139,7 +140,6 @@ impl<'a, 'i> Transform<'a, 'i> { index, fields_ids_map: index.fields_ids_map(wtxn)?, indexer_settings, - autogenerate_docids, available_documents_ids: AvailableDocumentsIds::from_documents_ids(&documents_ids), original_sorter, flattened_sorter, @@ -808,13 +808,13 @@ impl<'a, 'i> Transform<'a, 'i> { let mut new_inner_settings = old_inner_settings.clone(); new_inner_settings.fields_ids_map = fields_ids_map; - let embedding_configs_updated = false; + let embedding_config_updates = Default::default(); let settings_update_only = false; let settings_diff = InnerIndexSettingsDiff::new( old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -835,10 +835,13 @@ impl<'a, 'i> Transform<'a, 'i> { /// Rebind the field_ids of the provided document to their values /// based on the field_ids_maps difference between the old and the new settings, /// then fill the provided buffers with delta documents using KvWritterDelAdd. + #[allow(clippy::too_many_arguments)] // need the vectors + fid, feel free to create a struct xo xo fn rebind_existing_document( old_obkv: KvReader, settings_diff: &InnerIndexSettingsDiff, modified_faceted_fields: &HashSet, + mut injected_vectors: serde_json::Map, + old_vectors_fid: Option, original_obkv_buffer: Option<&mut Vec>, flattened_obkv_buffer: Option<&mut Vec>, ) -> Result<()> { @@ -861,9 +864,49 @@ impl<'a, 'i> Transform<'a, 'i> { // The operations that we must perform on the different fields. let mut operations = HashMap::new(); + let mut error_seen = false; let mut obkv_writer = KvWriter::<_, FieldId>::memory(); - for (id, val) in old_obkv.iter() { + 'write_fid: for (id, val) in old_obkv.iter() { + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + if id < vectors_fid { + break 'inject_vectors; + } + + let mut existing_vectors = if id == vectors_fid { + let existing_vectors: std::result::Result< + serde_json::Map, + serde_json::Error, + > = serde_json::from_slice(val); + + match existing_vectors { + Ok(existing_vectors) => existing_vectors, + Err(error) => { + if !error_seen { + tracing::error!(%error, "Unexpected `_vectors` field that is not a map. 
Treating as an empty map"); + error_seen = true; + } + Default::default() + } + } + } else { + Default::default() + }; + + existing_vectors.append(&mut injected_vectors); + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer + .insert(vectors_fid, serde_json::to_vec(&existing_vectors).unwrap())?; + if id == vectors_fid { + continue 'write_fid; + } + } + } + if is_primary_key(id) || necessary_faceted_field(id) || reindex_vectors { operations.insert(id, DelAddOperation::DeletionAndAddition); obkv_writer.insert(id, val)?; @@ -872,6 +915,15 @@ impl<'a, 'i> Transform<'a, 'i> { obkv_writer.insert(id, val)?; } } + if !injected_vectors.is_empty() { + 'inject_vectors: { + let Some(vectors_fid) = old_vectors_fid else { break 'inject_vectors }; + + operations.insert(vectors_fid, DelAddOperation::DeletionAndAddition); + obkv_writer.insert(vectors_fid, serde_json::to_vec(&injected_vectors).unwrap())?; + } + } + let data = obkv_writer.into_inner()?; let obkv = KvReader::::new(&data); @@ -937,6 +989,35 @@ impl<'a, 'i> Transform<'a, 'i> { None }; + let readers: Result< + BTreeMap<&str, (Vec>, &RoaringBitmap)>, + > = settings_diff + .embedding_config_updates + .iter() + .filter_map(|(name, action)| { + if let EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }) = action + { + let readers: Result> = + self.index.arroy_readers(wtxn, *embedder_id).collect(); + match readers { + Ok(readers) => Some(Ok((name.as_str(), (readers, user_provided)))), + Err(error) => Some(Err(error)), + } + } else { + None + } + }) + .collect(); + let readers = readers?; + + let old_vectors_fid = settings_diff + .old + .fields_ids_map + .id(crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME); + // We initialize the sorter with the user indexing settings. 
let mut flattened_sorter = if settings_diff.reindex_searchable() || settings_diff.reindex_facets() { @@ -963,10 +1044,50 @@ impl<'a, 'i> Transform<'a, 'i> { InternalError::DatabaseMissingEntry { db_name: db_name::DOCUMENTS, key: None }, )?; + let injected_vectors: std::result::Result< + serde_json::Map, + arroy::Error, + > = readers + .iter() + .filter_map(|(name, (readers, user_provided))| { + if !user_provided.contains(docid) { + return None; + } + let mut vectors = Vec::new(); + for reader in readers { + let Some(vector) = reader.item_vector(wtxn, docid).transpose() else { + break; + }; + + match vector { + Ok(vector) => vectors.push(vector), + Err(error) => return Some(Err(error)), + } + } + if vectors.is_empty() { + return None; + } + Some(Ok(( + name.to_string(), + serde_json::to_value(ExplicitVectors { + embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( + vectors, + )), + regenerate: false, + }) + .unwrap(), + ))) + }) + .collect(); + + let injected_vectors = injected_vectors?; + Self::rebind_existing_document( old_obkv, &settings_diff, &modified_faceted_fields, + injected_vectors, + old_vectors_fid, Some(&mut original_obkv_buffer).filter(|_| original_sorter.is_some()), Some(&mut flattened_obkv_buffer).filter(|_| flattened_sorter.is_some()), )?; @@ -983,6 +1104,23 @@ impl<'a, 'i> Transform<'a, 'i> { } } + let mut writers = Vec::new(); + + // delete all vectors from the embedders that need removal + for (_, (readers, _)) in readers { + for reader in readers { + let dimensions = reader.dimensions(); + let arroy_index = reader.index(); + drop(reader); + let writer = arroy::Writer::new(self.index.vector_arroy, arroy_index, dimensions); + writers.push(writer); + } + } + + for writer in writers { + writer.clear(wtxn)?; + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, diff --git a/milli/src/update/index_documents/typed_chunk.rs b/milli/src/update/index_documents/typed_chunk.rs index 2fbe91685..4737c6b42 100644 --- a/milli/src/update/index_documents/typed_chunk.rs +++ b/milli/src/update/index_documents/typed_chunk.rs @@ -20,6 +20,7 @@ use super::MergeFn; use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; +use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -90,6 +91,8 @@ pub(crate) enum TypedChunk { expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, + add_to_user_provided: RoaringBitmap, + remove_from_user_provided: RoaringBitmap, }, ScriptLanguageDocids(HashMap<(Script, Language), (RoaringBitmap, RoaringBitmap)>), } @@ -154,8 +157,11 @@ pub(crate) fn write_typed_chunk_into_index( let mut docids = index.documents_ids(wtxn)?; let mut iter = merger.into_stream_merger_iter()?; - let embedders: BTreeSet<_> = - index.embedding_configs(wtxn)?.into_iter().map(|(k, _v)| k).collect(); + let embedders: BTreeSet<_> = index + .embedding_configs(wtxn)? + .into_iter() + .map(|IndexEmbeddingConfig { name, .. }| name) + .collect(); let mut vectors_buffer = Vec::new(); while let Some((key, reader)) = iter.next()? 
{ let mut writer: KvWriter<_, FieldId> = KvWriter::memory(); @@ -181,7 +187,7 @@ pub(crate) fn write_typed_chunk_into_index( // if the `_vectors` field cannot be parsed as map of vectors, just write it as-is break 'vectors Some(addition); }; - vectors.retain_user_provided_vectors(&embedders); + vectors.retain_not_embedded_vectors(&embedders); let crate::vector::parsed_vectors::ParsedVectors(vectors) = vectors; if vectors.is_empty() { // skip writing empty `_vectors` map @@ -619,6 +625,8 @@ pub(crate) fn write_typed_chunk_into_index( let mut remove_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut manual_vectors_builder = MergerBuilder::new(keep_first as MergeFn); let mut embeddings_builder = MergerBuilder::new(keep_first as MergeFn); + let mut add_to_user_provided = RoaringBitmap::new(); + let mut remove_from_user_provided = RoaringBitmap::new(); let mut params = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { @@ -627,6 +635,8 @@ pub(crate) fn write_typed_chunk_into_index( embeddings, expected_dimension, embedder_name, + add_to_user_provided: aud, + remove_from_user_provided: rud, } = typed_chunk else { unreachable!(); @@ -639,11 +649,23 @@ pub(crate) fn write_typed_chunk_into_index( if let Some(embeddings) = embeddings { embeddings_builder.push(embeddings.into_cursor()?); } + add_to_user_provided |= aud; + remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let mut embedding_configs = index.embedding_configs(wtxn)?; + let index_embedder_config = embedding_configs + .iter_mut() + .find(|IndexEmbeddingConfig { name, .. }| name == &embedder_name) + .unwrap(); + index_embedder_config.user_provided -= remove_from_user_provided; + index_embedder_config.user_provided |= add_to_user_provided; + + index.put_embedding_configs(wtxn, embedding_configs)?; + let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, )?; diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index be9b6b74e..b792cde52 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -6,6 +6,7 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{EitherOrBoth, Itertools}; +use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -14,12 +15,18 @@ use super::index_documents::{IndexDocumentsConfig, Transform}; use super::IndexerConfig; use crate::criterion::Criterion; use crate::error::UserError; -use crate::index::{DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS}; +use crate::index::{ + IndexEmbeddingConfig, DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, +}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::{IndexDocuments, UpdateIndexingStep}; -use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; +use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; +use crate::vector::settings::{ + check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction, + WriteBackToDocuments, +}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{FieldId, FieldsIdsMap, Index, Result}; 
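From here on, settings updates no longer reduce to a single "reindex vectors: yes/no" flag: `update_embedding_configs` below returns a `BTreeMap<String, EmbedderAction>` so each embedder gets the cheapest sufficient treatment. A self-contained sketch of the three outcomes, using local mirrors of the types added in this diff (illustrative only):

// Local mirrors of `EmbedderAction`/`ReindexAction`, reduced to the decision
// the indexer has to make per embedder.
enum ReindexAction {
    RegeneratePrompts,
    FullReindex,
}

enum EmbedderAction {
    WriteBackToDocuments, // embedder removed: save its vectors into `_vectors` first
    Reindex(ReindexAction),
}

fn describe(action: &EmbedderAction) -> &'static str {
    match action {
        EmbedderAction::WriteBackToDocuments => "write embeddings back to documents, then drop the embedder",
        EmbedderAction::Reindex(ReindexAction::FullReindex) => "drop generated vectors and re-embed all documents",
        EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => "re-embed only documents whose rendered prompt changed",
    }
}
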
@@ -490,6 +497,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.index.put_all_searchable_fields_from_fields_ids_map( self.wtxn, &names, + &fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME), &fields_ids_map, )?; self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; @@ -919,92 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { Ok(changed) } - fn update_embedding_configs(&mut self) -> Result { - let update = match std::mem::take(&mut self.embedder_settings) { - Setting::Set(configs) => { - let mut changed = false; + fn update_embedding_configs(&mut self) -> Result> { + match std::mem::take(&mut self.embedder_settings) { + Setting::Set(configs) => self.update_embedding_configs_set(configs), + Setting::Reset => { + // all vectors should be written back to documents let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap> = - old_configs.into_iter().map(|(k, v)| (k, Setting::Set(v.into()))).collect(); - - let mut new_configs = BTreeMap::new(); - for joined in old_configs + let remove_all: Result> = old_configs .into_iter() - .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) - { - match joined { - // updated config - EitherOrBoth::Both((name, mut old), (_, new)) => { - changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); - if changed { - tracing::debug!(embedder = name, "need reindex"); - } else { - tracing::debug!(embedder = name, "skip reindex"); - } - let new = validate_embedding_settings(old, &name)?; - new_configs.insert(name, new); - } - // unchanged config - EitherOrBoth::Left((name, setting)) => { - new_configs.insert(name, setting); - } - // new config - EitherOrBoth::Right((name, mut setting)) => { - // apply the default source in case the source was not set so that it gets validated - crate::vector::settings::EmbeddingSettings::apply_default_source( - &mut setting, - ); - crate::vector::settings::EmbeddingSettings::apply_default_openai_model( - &mut setting, - ); - let setting = validate_embedding_settings(setting, &name)?; - changed = true; - new_configs.insert(name, setting); - } - } - } - let new_configs: Vec<(String, EmbeddingConfig)> = new_configs - .into_iter() - .filter_map(|(name, setting)| match setting { - Setting::Set(value) => Some((name, value.into())), - Setting::Reset => None, - Setting::NotSet => Some((name, EmbeddingSettings::default().into())), + .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> { + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + Ok(( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + )) }) .collect(); + let remove_all = remove_all?; + self.index.embedder_category_id.clear(self.wtxn)?; - for (index, (embedder_name, _)) in new_configs.iter().enumerate() { - self.index.embedder_category_id.put_with_flags( - self.wtxn, - heed::PutFlags::APPEND, - embedder_name, - &index - .try_into() - .map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?, - )?; - } - - if new_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; - } else { - self.index.put_embedding_configs(self.wtxn, new_configs)?; - } - changed - } - Setting::Reset => { self.index.delete_embedding_configs(self.wtxn)?; - true + Ok(remove_all) } - Setting::NotSet => false, - }; - - // if any changes force a reindexing - // clear the vector 
database. - if update { - self.index.vector_arroy.clear(self.wtxn)?; + Setting::NotSet => Ok(Default::default()), } + } - Ok(update) + fn update_embedding_configs_set( + &mut self, + configs: BTreeMap>, + ) -> Result> { + use crate::vector::settings::SettingsDiff; + + let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs + .into_iter() + .map(|IndexEmbeddingConfig { name, config, user_provided }| { + (name, (config.into(), user_provided)) + }) + .collect(); + let mut updated_configs = BTreeMap::new(); + let mut embedder_actions = BTreeMap::new(); + for joined in old_configs + .into_iter() + .merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) + { + match joined { + // updated config + EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + let settings_diff = SettingsDiff::from_settings(old, new); + match settings_diff { + SettingsDiff::Remove => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "removing embedder" + ); + let embedder_id = + self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; + // free id immediately + self.index.embedder_category_id.delete(self.wtxn, &name)?; + embedder_actions.insert( + name, + EmbedderAction::WriteBackToDocuments(WriteBackToDocuments { + embedder_id, + user_provided, + }), + ); + } + SettingsDiff::Reindex { action, updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + ?action, + "reindex embedder" + ); + embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action)); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + SettingsDiff::UpdateWithoutReindex { updated_settings } => { + tracing::debug!( + embedder = name, + user_provided = user_provided.len(), + "update without reindex embedder" + ); + let new = + validate_embedding_settings(Setting::Set(updated_settings), &name)?; + updated_configs.insert(name, (new, user_provided)); + } + } + } + // unchanged config + EitherOrBoth::Left((name, (setting, user_provided))) => { + tracing::debug!(embedder = name, "unchanged embedder"); + updated_configs.insert(name, (Setting::Set(setting), user_provided)); + } + // new config + EitherOrBoth::Right((name, mut setting)) => { + tracing::debug!(embedder = name, "new embedder"); + // apply the default source in case the source was not set so that it gets validated + crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting); + crate::vector::settings::EmbeddingSettings::apply_default_openai_model( + &mut setting, + ); + let setting = validate_embedding_settings(setting, &name)?; + embedder_actions + .insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex)); + updated_configs.insert(name, (setting, RoaringBitmap::new())); + } + } + } + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ + let (_name, id) = res?; + free_indices[id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + for (name, action) in embedder_actions.iter() { + match action { + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { + /* cannot be a new embedder, so has to have an id already */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { + if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() { + let id = find_free_index() + .ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; + tracing::debug!(embedder = name, id, "assigning free id to new embedder"); + self.index.embedder_category_id.put(self.wtxn, name, &id)?; + } + } + EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ } + } + } + let updated_configs: Vec = updated_configs + .into_iter() + .filter_map(|(name, (config, user_provided))| match config { + Setting::Set(config) => { + Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + } + Setting::Reset => None, + Setting::NotSet => Some(IndexEmbeddingConfig { + name, + config: EmbeddingSettings::default().into(), + user_provided, + }), + }) + .collect(); + if updated_configs.is_empty() { + self.index.delete_embedding_configs(self.wtxn)?; + } else { + self.index.put_embedding_configs(self.wtxn, updated_configs)?; + } + Ok(embedder_actions) } fn update_search_cutoff(&mut self) -> Result { @@ -1058,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { self.update_searchable()?; self.update_exact_attributes()?; self.update_proximity_precision()?; - // TODO: very rough approximation of the needs for reindexing where any change will result in - // a full reindexing. - // What can be done instead: - // 1. Only change the distance on a distance change - // 2. Only change the name -> embedder mapping on a name change - // 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage - let embedding_configs_updated = self.update_embedding_configs()?; + + let embedding_config_updates = self.update_embedding_configs()?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?; @@ -1078,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { old_inner_settings, new_inner_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, ); @@ -1094,8 +1182,7 @@ pub struct InnerIndexSettingsDiff { pub(crate) old: InnerIndexSettings, pub(crate) new: InnerIndexSettings, pub(crate) primary_key_id: Option, - // TODO: compare directly the embedders. - pub(crate) embedding_configs_updated: bool, + pub(crate) embedding_config_updates: BTreeMap, pub(crate) settings_update_only: bool, /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. 
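The id-allocation loop above replaces the old append-only numbering: embedder category ids are `u8` slots that are freed on deletion and reused for new embedders. Reduced to its core, the scan looks like this (a sketch; `used` is assumed to be prefilled from the existing name-to-id mapping):

// First-fit scan over the 255 possible embedder ids; returns None when the
// table is full, which surfaces as `UserError::TooManyEmbedders` in the diff.
fn first_free_id(used: &[bool; u8::MAX as usize]) -> Option<u8> {
    used.iter().position(|&slot| !slot).map(|id| id as u8)
}
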
@@ -1116,7 +1203,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_configs_updated: bool, + embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1153,7 +1240,7 @@ impl InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, - embedding_configs_updated, + embedding_config_updates, settings_update_only, only_additional_fields, cache_reindex_searchable_without_user_defined, @@ -1220,7 +1307,7 @@ impl InnerIndexSettingsDiff { } pub fn reindex_vectors(&self) -> bool { - self.embedding_configs_updated + !self.embedding_config_updates.is_empty() } pub fn settings_update_only(&self) -> bool { @@ -1252,6 +1339,8 @@ pub(crate) struct InnerIndexSettings { pub embedding_configs: EmbeddingConfigs, pub existing_fields: HashSet, pub geo_fields_ids: Option<(FieldId, FieldId)>, + pub non_searchable_fields_ids: Vec, + pub non_faceted_fields_ids: Vec, } impl InnerIndexSettings { @@ -1265,8 +1354,8 @@ impl InnerIndexSettings { let user_defined_searchable_fields = user_defined_searchable_fields.map(|sf| sf.into_iter().map(String::from).collect()); let user_defined_faceted_fields = index.user_defined_faceted_fields(rtxn)?; - let searchable_fields_ids = index.searchable_fields_ids(rtxn)?; - let faceted_fields_ids = index.faceted_fields_ids(rtxn)?; + let mut searchable_fields_ids = index.searchable_fields_ids(rtxn)?; + let mut faceted_fields_ids = index.faceted_fields_ids(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); let embedding_configs = embedders(index.embedding_configs(rtxn)?)?; @@ -1294,6 +1383,10 @@ impl InnerIndexSettings { None => None, }; + let vectors_fids = fields_ids_map.nested_ids(RESERVED_VECTORS_FIELD_NAME); + searchable_fields_ids.retain(|id| !vectors_fids.contains(id)); + faceted_fields_ids.retain(|id| !vectors_fids.contains(id)); + Ok(Self { stop_words, allowed_separators, @@ -1308,6 +1401,8 @@ impl InnerIndexSettings { embedding_configs, existing_fields, geo_fields_ids, + non_searchable_fields_ids: vectors_fids.clone(), + non_faceted_fields_ids: vectors_fids.clone(), }) } @@ -1315,9 +1410,10 @@ impl InnerIndexSettings { pub fn recompute_facets(&mut self, wtxn: &mut heed::RwTxn, index: &Index) -> Result<()> { let new_facets = self .fields_ids_map - .names() - .filter(|&field| crate::is_faceted(field, &self.user_defined_faceted_fields)) - .map(|field| field.to_string()) + .iter() + .filter(|(fid, _field)| !self.non_faceted_fields_ids.contains(fid)) + .filter(|(_fid, field)| crate::is_faceted(field, &self.user_defined_faceted_fields)) + .map(|(_fid, field)| field.to_string()) .collect(); index.put_faceted_fields(wtxn, &new_facets)?; @@ -1337,6 +1433,7 @@ impl InnerIndexSettings { index.put_all_searchable_fields_from_fields_ids_map( wtxn, &searchable_fields, + &self.non_searchable_fields_ids, &self.fields_ids_map, )?; } @@ -1347,19 +1444,25 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<(String, EmbeddingConfig)>) -> Result { +fn embedders(embedding_configs: Vec) -> Result { let res: Result<_> = embedding_configs .into_iter() - .map(|(name, EmbeddingConfig { embedder_options, prompt })| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + .map( + |IndexEmbeddingConfig { + name, + config: EmbeddingConfig { embedder_options, prompt }, + .. 
+ }| { + let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); - let embedder = Arc::new( - Embedder::new(embedder_options.clone()) - .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt))) - }) + let embedder = Arc::new( + Embedder::new(embedder_options.clone()) + .map_err(crate::vector::Error::from) + .map_err(crate::Error::from)?, + ); + Ok((name, (embedder, prompt))) + }, + ) .collect(); res.map(EmbeddingConfigs::new) } diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index 553c8c3c1..c43fa8bd2 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -152,6 +152,10 @@ impl EmbeddingConfigs { &self.0 } + pub fn into_inner(self) -> HashMap, Arc)> { + self.0 + } + /// Get the name of the default embedder configuration. /// /// The default embedder is determined as follows: diff --git a/milli/src/vector/parsed_vectors.rs b/milli/src/vector/parsed_vectors.rs index 2c61baa9e..92d6cb382 100644 --- a/milli/src/vector/parsed_vectors.rs +++ b/milli/src/vector/parsed_vectors.rs @@ -4,8 +4,9 @@ use obkv::KvReader; use serde_json::{from_slice, Value}; use super::Embedding; +use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{FieldId, InternalError, UserError}; +use crate::{DocumentId, FieldId, InternalError, UserError}; pub const RESERVED_VECTORS_FIELD_NAME: &str = "_vectors"; @@ -17,11 +18,20 @@ pub enum Vectors { } impl Vectors { - pub fn into_array_of_vectors(self) -> Vec { + pub fn must_regenerate(&self) -> bool { match self { - Vectors::ImplicitlyUserProvided(embeddings) - | Vectors::Explicit(ExplicitVectors { embeddings, user_provided: _ }) => { - embeddings.into_array_of_vectors().unwrap_or_default() + Vectors::ImplicitlyUserProvided(_) => false, + Vectors::Explicit(ExplicitVectors { regenerate, .. 
}) => *regenerate, + } + } + + pub fn into_array_of_vectors(self) -> Option> { + match self { + Vectors::ImplicitlyUserProvided(embeddings) => { + Some(embeddings.into_array_of_vectors().unwrap_or_default()) + } + Vectors::Explicit(ExplicitVectors { embeddings, regenerate: _ }) => { + embeddings.map(|embeddings| embeddings.into_array_of_vectors().unwrap_or_default()) } } } @@ -30,22 +40,46 @@ impl Vectors { #[derive(serde::Serialize, serde::Deserialize, Debug)] #[serde(rename_all = "camelCase")] pub struct ExplicitVectors { - pub embeddings: VectorOrArrayOfVectors, - pub user_provided: bool, + pub embeddings: Option, + pub regenerate: bool, +} + +pub enum VectorState { + Inline(Vectors), + Manual, + Generated, +} + +impl VectorState { + pub fn must_regenerate(&self) -> bool { + match self { + VectorState::Inline(vectors) => vectors.must_regenerate(), + VectorState::Manual => false, + VectorState::Generated => true, + } + } +} + +pub enum VectorsState { + NoVectorsFid, + NoVectorsFieldInDocument, + Vectors(BTreeMap), } pub struct ParsedVectorsDiff { - pub old: Option>, - pub new: Option>, + old: BTreeMap, + new: VectorsState, } impl ParsedVectorsDiff { pub fn new( + docid: DocumentId, + embedders_configs: &[IndexEmbeddingConfig], documents_diff: KvReader<'_, FieldId>, old_vectors_fid: Option, new_vectors_fid: Option, ) -> Result { - let old = match old_vectors_fid + let mut old = match old_vectors_fid .and_then(|vectors_fid| documents_diff.get(vectors_fid)) .map(KvReaderDelAdd::new) .map(|obkv| to_vector_map(obkv, DelAdd::Deletion)) @@ -61,19 +95,54 @@ impl ParsedVectorsDiff { return Err(error); } } - .flatten(); - let new = new_vectors_fid - .and_then(|vectors_fid| documents_diff.get(vectors_fid)) - .map(KvReaderDelAdd::new) - .map(|obkv| to_vector_map(obkv, DelAdd::Addition)) - .transpose()? - .flatten(); + .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); + for embedding_config in embedders_configs { + if embedding_config.user_provided.contains(docid) { + old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); + } + } + + let new = 'new: { + let Some(new_vectors_fid) = new_vectors_fid else { + break 'new VectorsState::NoVectorsFid; + }; + let Some(bytes) = documents_diff.get(new_vectors_fid) else { + break 'new VectorsState::NoVectorsFieldInDocument; + }; + let obkv = KvReaderDelAdd::new(bytes); + match to_vector_map(obkv, DelAdd::Addition)? 
{ + Some(new) => VectorsState::Vectors(new), + None => VectorsState::NoVectorsFieldInDocument, + } + }; + Ok(Self { old, new }) } - pub fn remove(&mut self, embedder_name: &str) -> (Option, Option) { - let old = self.old.as_mut().and_then(|old| old.remove(embedder_name)); - let new = self.new.as_mut().and_then(|new| new.remove(embedder_name)); + pub fn remove(&mut self, embedder_name: &str) -> (VectorState, VectorState) { + let old = self.old.remove(embedder_name).unwrap_or(VectorState::Generated); + let state_from_old = match old { + // assume a userProvided is still userProvided + VectorState::Manual => VectorState::Manual, + // generated is still generated + VectorState::Generated => VectorState::Generated, + // weird case that shouldn't happen were the previous docs version is inline, + // but it was removed in the new version + // Since it is not in the new version, we switch to generated + VectorState::Inline(_) => VectorState::Generated, + }; + let new = match &mut self.new { + VectorsState::Vectors(new) => { + new.remove(embedder_name).map(VectorState::Inline).unwrap_or(state_from_old) + } + _ => + // if no `_vectors` field is present in the new document, + // the state depends on the previous version of the document + { + state_from_old + } + }; + (old, new) } } @@ -89,15 +158,8 @@ impl ParsedVectors { Ok(ParsedVectors(value)) } - pub fn retain_user_provided_vectors(&mut self, embedders: &BTreeSet) { - self.0.retain(|k, v| match v { - Vectors::ImplicitlyUserProvided(_) => true, - Vectors::Explicit(ExplicitVectors { embeddings: _, user_provided }) => { - *user_provided - // if the embedder is not in the config, then never touch it - || !embedders.contains(k) - } - }); + pub fn retain_not_embedded_vectors(&mut self, embedders: &BTreeSet) { + self.0.retain(|k, _v| !embedders.contains(k)) } } @@ -150,6 +212,22 @@ impl VectorOrArrayOfVectors { pub fn from_array_of_vectors(array_of_vec: Vec) -> Self { Self { inner: Some(either::Either::Left(array_of_vec)) } } + + pub fn from_vector(vec: Embedding) -> Self { + Self { inner: Some(either::Either::Right(vec)) } + } +} + +impl From for VectorOrArrayOfVectors { + fn from(vec: Embedding) -> Self { + Self::from_vector(vec) + } +} + +impl From> for VectorOrArrayOfVectors { + fn from(vec: Vec) -> Self { + Self::from_array_of_vectors(vec) + } } #[cfg(test)] diff --git a/milli/src/vector/settings.rs b/milli/src/vector/settings.rs index e786a7164..9c7fb09b1 100644 --- a/milli/src/vector/settings.rs +++ b/milli/src/vector/settings.rs @@ -1,4 +1,5 @@ use deserr::Deserr; +use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use super::rest::InputType; @@ -72,6 +73,238 @@ pub fn check_unset( } } +/// Indicates what action should take place during a reindexing operation for an embedder +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum ReindexAction { + /// An indexing operation should take place for this embedder, keeping existing vectors + /// and checking whether the document template changed or not + RegeneratePrompts, + /// An indexing operation should take place for all documents for this embedder, removing existing vectors + /// (except userProvided ones) + FullReindex, +} + +pub enum SettingsDiff { + Remove, + Reindex { action: ReindexAction, updated_settings: EmbeddingSettings }, + UpdateWithoutReindex { updated_settings: EmbeddingSettings }, +} + +pub enum EmbedderAction { + WriteBackToDocuments(WriteBackToDocuments), + Reindex(ReindexAction), +} + +pub struct WriteBackToDocuments { + pub embedder_id: u8, + pub 
user_provided: RoaringBitmap, +} + +impl SettingsDiff { + pub fn from_settings(old: EmbeddingSettings, new: Setting) -> Self { + match new { + Setting::Set(new) => { + let EmbeddingSettings { + mut source, + mut model, + mut revision, + mut api_key, + mut dimensions, + mut document_template, + mut url, + mut query, + mut input_field, + mut path_to_embeddings, + mut embedding_object, + mut input_type, + mut distribution, + } = old; + + let EmbeddingSettings { + source: new_source, + model: new_model, + revision: new_revision, + api_key: new_api_key, + dimensions: new_dimensions, + document_template: new_document_template, + url: new_url, + query: new_query, + input_field: new_input_field, + path_to_embeddings: new_path_to_embeddings, + embedding_object: new_embedding_object, + input_type: new_input_type, + distribution: new_distribution, + } = new; + + let mut reindex_action = None; + + // **Warning**: do not use short-circuiting || here, we want all these operations applied + if source.apply(new_source) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + // when the source changes, we need to reapply the default settings for the new source + apply_default_for_source( + &source, + &mut model, + &mut revision, + &mut dimensions, + &mut url, + &mut query, + &mut input_field, + &mut path_to_embeddings, + &mut embedding_object, + &mut input_type, + &mut document_template, + ) + } + if model.apply(new_model) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if revision.apply(new_revision) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if dimensions.apply(new_dimensions) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if url.apply(new_url) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if query.apply(new_query) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if input_field.apply(new_input_field) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if path_to_embeddings.apply(new_path_to_embeddings) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if embedding_object.apply(new_embedding_object) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if input_type.apply(new_input_type) { + ReindexAction::push_action(&mut reindex_action, ReindexAction::FullReindex); + } + if document_template.apply(new_document_template) { + ReindexAction::push_action( + &mut reindex_action, + ReindexAction::RegeneratePrompts, + ); + } + + distribution.apply(new_distribution); + api_key.apply(new_api_key); + + let updated_settings = EmbeddingSettings { + source, + model, + revision, + api_key, + dimensions, + document_template, + url, + query, + input_field, + path_to_embeddings, + embedding_object, + input_type, + distribution, + }; + + match reindex_action { + Some(action) => Self::Reindex { action, updated_settings }, + None => Self::UpdateWithoutReindex { updated_settings }, + } + } + Setting::Reset => Self::Remove, + Setting::NotSet => Self::UpdateWithoutReindex { updated_settings: old }, + } + } +} + +impl ReindexAction { + fn push_action(this: &mut Option, other: Self) { + *this = match (*this, other) { + (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), + (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), + (_, 
+
+impl ReindexAction {
+    fn push_action(this: &mut Option<ReindexAction>, other: Self) {
+        *this = match (*this, other) {
+            (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex),
+            (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex),
+            (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts),
+        }
+    }
+}
+
+#[allow(clippy::too_many_arguments)] // private function
+fn apply_default_for_source(
+    source: &Setting<EmbedderSource>,
+    model: &mut Setting<String>,
+    revision: &mut Setting<String>,
+    dimensions: &mut Setting<usize>,
+    url: &mut Setting<String>,
+    query: &mut Setting<serde_json::Value>,
+    input_field: &mut Setting<Vec<String>>,
+    path_to_embeddings: &mut Setting<Vec<String>>,
+    embedding_object: &mut Setting<Vec<String>>,
+    input_type: &mut Setting<InputType>,
+    document_template: &mut Setting<String>,
+) {
+    match source {
+        Setting::Set(EmbedderSource::HuggingFace) => {
+            *model = Setting::Reset;
+            *revision = Setting::Reset;
+            *dimensions = Setting::NotSet;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::Ollama) => {
+            *model = Setting::Reset;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::OpenAi) | Setting::Reset => {
+            *model = Setting::Reset;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::NotSet;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+        }
+        Setting::Set(EmbedderSource::Rest) => {
+            *model = Setting::NotSet;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::Reset;
+            *query = Setting::Reset;
+            *input_field = Setting::Reset;
+            *path_to_embeddings = Setting::Reset;
+            *embedding_object = Setting::Reset;
+            *input_type = Setting::Reset;
+        }
+        Setting::Set(EmbedderSource::UserProvided) => {
+            *model = Setting::NotSet;
+            *revision = Setting::NotSet;
+            *dimensions = Setting::Reset;
+            *url = Setting::NotSet;
+            *query = Setting::NotSet;
+            *input_field = Setting::NotSet;
+            *path_to_embeddings = Setting::NotSet;
+            *embedding_object = Setting::NotSet;
+            *input_type = Setting::NotSet;
+            *document_template = Setting::NotSet;
+        }
+        Setting::NotSet => {}
+    }
+}
+
 pub fn check_set<T>(
     key: &Setting<T>,
     field: &'static str,
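Worth pausing on the Reset-vs-NotSet convention above before the next hunk. A minimal model, assuming only what the function shows: `Reset` sends a field back to the new source's default, `NotSet` marks it as inapplicable to that source. The local `Setting` and `Fields` types stand in for milli's, and only two sources and two fields are modeled.

```rust
// Reduced model of `apply_default_for_source` for two fields and two sources.
#[allow(dead_code)]
#[derive(Debug, PartialEq)]
enum Setting<T> {
    Set(T),
    Reset,  // fall back to this source's default value
    NotSet, // the field does not apply to this source
}

struct Fields {
    model: Setting<String>,
    url: Setting<String>,
}

fn defaults_for_source(source: &str) -> Fields {
    match source {
        // Hugging Face loads a local model: model is meaningful (reset to its
        // default), while url does not apply
        "huggingFace" => Fields { model: Setting::Reset, url: Setting::NotSet },
        // a REST embedder has no model concept but requires an endpoint url
        "rest" => Fields { model: Setting::NotSet, url: Setting::Reset },
        _ => Fields { model: Setting::NotSet, url: Setting::NotSet },
    }
}

fn main() {
    // switching the source to `rest` drops any previously set model
    let fields = defaults_for_source("rest");
    assert_eq!(fields.model, Setting::NotSet);
    assert_eq!(fields.url, Setting::Reset);
}
```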
@@ -210,66 +443,6 @@ impl EmbeddingSettings {
             *model = Setting::Set(openai::EmbeddingModel::default().name().to_owned())
         }
     }
-
-    pub(crate) fn apply_and_need_reindex(
-        old: &mut Setting<EmbeddingSettings>,
-        new: Setting<EmbeddingSettings>,
-    ) -> bool {
-        match (old, new) {
-            (
-                Setting::Set(EmbeddingSettings {
-                    source: old_source,
-                    model: old_model,
-                    revision: old_revision,
-                    api_key: old_api_key,
-                    dimensions: old_dimensions,
-                    document_template: old_document_template,
-                    url: old_url,
-                    query: old_query,
-                    input_field: old_input_field,
-                    path_to_embeddings: old_path_to_embeddings,
-                    embedding_object: old_embedding_object,
-                    input_type: old_input_type,
-                    distribution: old_distribution,
-                }),
-                Setting::Set(EmbeddingSettings {
-                    source: new_source,
-                    model: new_model,
-                    revision: new_revision,
-                    api_key: new_api_key,
-                    dimensions: new_dimensions,
-                    document_template: new_document_template,
-                    url: new_url,
-                    query: new_query,
-                    input_field: new_input_field,
-                    path_to_embeddings: new_path_to_embeddings,
-                    embedding_object: new_embedding_object,
-                    input_type: new_input_type,
-                    distribution: new_distribution,
-                }),
-            ) => {
-                let mut needs_reindex = false;
-
-                needs_reindex |= old_source.apply(new_source);
-                needs_reindex |= old_model.apply(new_model);
-                needs_reindex |= old_revision.apply(new_revision);
-                needs_reindex |= old_dimensions.apply(new_dimensions);
-                needs_reindex |= old_document_template.apply(new_document_template);
-                needs_reindex |= old_url.apply(new_url);
-                needs_reindex |= old_query.apply(new_query);
-                needs_reindex |= old_input_field.apply(new_input_field);
-                needs_reindex |= old_path_to_embeddings.apply(new_path_to_embeddings);
-                needs_reindex |= old_embedding_object.apply(new_embedding_object);
-                needs_reindex |= old_input_type.apply(new_input_type);
-
-                old_distribution.apply(new_distribution);
-                old_api_key.apply(new_api_key);
-                needs_reindex
-            }
-            (Setting::Reset, Setting::Reset) | (_, Setting::NotSet) => false,
-            _ => true,
-        }
-    }
 }
 
 #[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq, Deserr)]
diff --git a/xtask/Cargo.toml b/xtask/Cargo.toml
index 562dfddb3..a618b06a5 100644
--- a/xtask/Cargo.toml
+++ b/xtask/Cargo.toml
@@ -21,7 +21,7 @@ reqwest = { version = "0.11.23", features = [
     "stream",
     "json",
     "rustls-tls",
-], default_features = false }
+], default-features = false }
 serde = { version = "1.0.195", features = ["derive"] }
 serde_json = "1.0.111"
 sha2 = "0.10.8"
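Finally, for contrast with the deleted boolean `apply_and_need_reindex`: a hedged sketch of how a caller consumes the three-way `SettingsDiff` instead. The enum shapes mirror the hunks above; the handler and its match-arm bodies are hypothetical placeholders, not scheduler code.

```rust
// A consumer of the new three-way SettingsDiff, replacing the single bool
// the removed apply_and_need_reindex used to return.
enum ReindexAction {
    RegeneratePrompts,
    FullReindex,
}

enum SettingsDiff<S> {
    Remove,
    Reindex { action: ReindexAction, updated_settings: S },
    UpdateWithoutReindex { updated_settings: S },
}

fn handle_embedder_settings<S>(diff: SettingsDiff<S>) {
    match diff {
        // embedder removed: write userProvided vectors back into the
        // documents' `_vectors` field before dropping the stored embeddings
        SettingsDiff::Remove => { /* WriteBackToDocuments */ }
        SettingsDiff::Reindex { action: ReindexAction::FullReindex, .. } => {
            // connection-level change: clear generated vectors and re-embed
        }
        SettingsDiff::Reindex { action: ReindexAction::RegeneratePrompts, .. } => {
            // template change: re-render prompts, keeping existing vectors
            // where possible
        }
        SettingsDiff::UpdateWithoutReindex { .. } => {
            // e.g. only api_key or distribution changed: persist settings only
        }
    }
}

fn main() {
    handle_embedder_settings::<()>(SettingsDiff::Remove);
}
```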