diff --git a/crates/benchmarks/benches/indexing.rs b/crates/benchmarks/benches/indexing.rs index 16e7a2f81..4083b69dd 100644 --- a/crates/benchmarks/benches/indexing.rs +++ b/crates/benchmarks/benches/indexing.rs @@ -11,7 +11,7 @@ use milli::heed::{EnvOpenOptions, RwTxn}; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{FilterableAttributesRule, Index}; use rand::seq::SliceRandom; use rand_chacha::rand_core::SeedableRng; @@ -166,7 +166,7 @@ fn indexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -233,7 +233,7 @@ fn reindexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -278,7 +278,7 @@ fn reindexing_songs_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -347,7 +347,7 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -424,7 +424,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -469,7 +469,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -510,7 +510,7 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -578,7 +578,7 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -645,7 +645,7 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -712,7 +712,7 @@ fn indexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -778,7 +778,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -823,7 +823,7 @@ fn reindexing_wiki(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -891,7 +891,7 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { 
new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -968,7 +968,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1014,7 +1014,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1056,7 +1056,7 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1123,7 +1123,7 @@ fn indexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1189,7 +1189,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1234,7 +1234,7 @@ fn reindexing_movies_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1302,7 +1302,7 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -1351,7 +1351,7 @@ fn delete_documents_from_ids(index: Index, document_ids_to_delete: Vec<RoaringBitmap>) -> Index { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), diff --git a/crates/fuzzers/src/bin/fuzz-indexing.rs b/crates/fuzzers/src/bin/fuzz-indexing.rs index 0632b7846..ec1f96fd5 100644 --- a/crates/fuzzers/src/bin/fuzz-indexing.rs +++ b/crates/fuzzers/src/bin/fuzz-indexing.rs @@ -13,7 +13,7 @@ use milli::heed::EnvOpenOptions; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::IndexerConfig; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::Index; use serde_json::Value; use tempfile::TempDir; @@ -89,7 +89,7 @@ fn main() { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut operations = Vec::new(); diff --git a/crates/index-scheduler/src/features.rs b/crates/index-scheduler/src/features.rs index 78ffc0766..b52a659a6 100644 --- a/crates/index-scheduler/src/features.rs +++ b/crates/index-scheduler/src/features.rs @@ -144,6 +144,19 @@ impl RoFeatures { .into()) } } + + pub fn check_multimodal(&self, disabled_action: &'static str) -> Result<()> { + if self.runtime.multimodal { + Ok(()) + } else { + Err(FeatureNotEnabledError { + disabled_action, + feature: "multimodal", + issue_link: "https://github.com/orgs/meilisearch/discussions/846", + } + .into()) + } + } } impl FeatureData { diff --git
a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs index 505ce23f8..b2f27d66b 100644 --- a/crates/index-scheduler/src/lib.rs +++ b/crates/index-scheduler/src/lib.rs @@ -57,12 +57,15 @@ use meilisearch_types::features::{ use meilisearch_types::heed::byteorder::BE; use meilisearch_types::heed::types::{DecodeIgnore, SerdeJson, Str, I128}; use meilisearch_types::heed::{self, Database, Env, RoTxn, WithoutTls}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexerConfig; -use meilisearch_types::milli::vector::{Embedder, EmbedderOptions, EmbeddingConfigs}; +use meilisearch_types::milli::vector::json_template::JsonTemplate; +use meilisearch_types::milli::vector::{ + Embedder, EmbedderOptions, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, +}; use meilisearch_types::milli::{self, Index}; use meilisearch_types::task_view::TaskView; use meilisearch_types::tasks::{KindWithContent, Task}; +use milli::vector::db::IndexEmbeddingConfig; use processing::ProcessingTasks; pub use queue::Query; use queue::Queue; @@ -851,29 +854,42 @@ impl IndexScheduler { &self, index_uid: String, embedding_configs: Vec<IndexEmbeddingConfig>, - ) -> Result<EmbeddingConfigs> { + ) -> Result<RuntimeEmbedders> { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: milli::vector::EmbeddingConfig { embedder_options, prompt, quantized }, - .. - }| { - let prompt = Arc::new( - prompt - .try_into() - .map_err(meilisearch_types::milli::Error::from) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?, - ); + fragments, + }| + -> Result<(String, Arc<RuntimeEmbedder>)> { + let document_template = prompt + .try_into() + .map_err(meilisearch_types::milli::Error::from) + .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let value = embedder_options.fragment(&fragment.name).unwrap(); + let template = JsonTemplate::new(value.clone()).unwrap(); + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); // optimistically return existing embedder { let embedders = self.embedders.read().unwrap(); if let Some(embedder) = embedders.get(&embedder_options) { - return Ok(( - name, - (embedder.clone(), prompt, quantized.unwrap_or_default()), + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), + document_template, + fragments, + quantized.unwrap_or_default(), )); + + return Ok((name, runtime)); } } @@ -889,11 +905,19 @@ impl IndexScheduler { let mut embedders = self.embedders.write().unwrap(); embedders.insert(embedder_options, embedder.clone()); } - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + + let runtime = Arc::new(RuntimeEmbedder::new( + embedder.clone(), + document_template, + fragments, + quantized.unwrap_or_default(), + )); + + Ok((name, runtime)) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } pub fn chat_settings(&self, uid: &str) -> Result<Option<ChatCompletionSettings>> { diff --git a/crates/index-scheduler/src/scheduler/process_dump_creation.rs b/crates/index-scheduler/src/scheduler/process_dump_creation.rs index a6d785b2f..ec1be0e93 100644 --- a/crates/index-scheduler/src/scheduler/process_dump_creation.rs +++ b/crates/index-scheduler/src/scheduler/process_dump_creation.rs @@ -165,9 +165,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&rtxn) - .map_err(|e|
Error::from_milli(e, Some(uid.to_string())))?; let nb_documents = index .number_of_documents(&rtxn) @@ -221,16 +218,12 @@ impl IndexScheduler { return Err(Error::from_milli(user_err, Some(uid.to_string()))); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap()); } diff --git a/crates/index-scheduler/src/scheduler/process_export.rs b/crates/index-scheduler/src/scheduler/process_export.rs index 30721065e..2062e1c28 100644 --- a/crates/index-scheduler/src/scheduler/process_export.rs +++ b/crates/index-scheduler/src/scheduler/process_export.rs @@ -150,9 +150,6 @@ impl IndexScheduler { let fields_ids_map = index.fields_ids_map(&index_rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index - .embedding_configs(&index_rtxn) - .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; // We don't need to keep this one alive as we will // spawn many threads to process the documents @@ -232,17 +229,12 @@ impl IndexScheduler { )); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(docid)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some( VectorOrArrayOfVectors::from_array_of_vectors(embeddings), ), - regenerate: !user_provided, + regenerate, }; vectors.insert( embedder_name, diff --git a/crates/index-scheduler/src/scheduler/process_index_operation.rs b/crates/index-scheduler/src/scheduler/process_index_operation.rs index 04aaf9a84..62d0e6545 100644 --- a/crates/index-scheduler/src/scheduler/process_index_operation.rs +++ b/crates/index-scheduler/src/scheduler/process_index_operation.rs @@ -89,8 +89,9 @@ impl IndexScheduler { let mut content_files_iter = content_files.iter(); let mut indexer = indexer::DocumentOperation::new(); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|e| Error::from_milli(e, Some(index_uid.clone())))?; + .map_err(|e| Error::from_milli(e.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; for operation in operations { match operation { @@ -274,8 +275,9 @@ impl IndexScheduler { }) .unwrap()?; let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; progress.update_progress(DocumentEditionProgress::Indexing); @@ -423,8 +425,9 @@ impl IndexScheduler { indexer.delete_documents_by_docids(to_delete); let document_changes = indexer.into_changes(&indexer_alloc, primary_key); let embedders = index + .embedding_configs() .embedding_configs(index_wtxn) - .map_err(|err| Error::from_milli(err, Some(index_uid.clone())))?; + .map_err(|err| Error::from_milli(err.into(), Some(index_uid.clone())))?; let embedders = self.embedders(index_uid.clone(), embedders)?; 
progress.update_progress(DocumentDeletionProgress::Indexing); diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap new file mode 100644 index 000000000..82134b838 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test__settings_update-7.snap @@ -0,0 +1,17 @@ +--- +source: crates/index-scheduler/src/scheduler/test.rs +expression: config.embedder_options +--- +{ + "Rest": { + "api_key": "My super secret", + "distribution": null, + "dimensions": 4, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap new file mode 100644 index 000000000..19b5cab92 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-14.snap @@ -0,0 +1,12 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: simple_hf_config.embedder_options +--- +{ + "HuggingFace": { + "model": "sentence-transformers/all-MiniLM-L6-v2", + "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "distribution": null, + "pooling": "useModel" + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap new file mode 100644 index 000000000..0fc8bd531 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-27.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": "Intel", + "breed": "beagle", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap new file mode 100644 index 000000000..0942e4d82 --- /dev/null +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-40.snap @@ -0,0 +1,15 @@ +--- +source: crates/index-scheduler/src/scheduler/test_embedders.rs +expression: doc +--- +{ + "doggo": "kefir", + "breed": "patou", + "_vectors": { + "noise": [ + 0.1, + 0.2, + 0.3 + ] + } +} diff --git a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap index 19b5cab92..29f35d9c1 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/index_scheduler__scheduler__test_embedders__import_vectors-8.snap @@ -1,12 +1,17 @@ --- source: crates/index-scheduler/src/scheduler/test_embedders.rs -expression: simple_hf_config.embedder_options +expression: fakerest_config.embedder_options --- { - 
"HuggingFace": { - "model": "sentence-transformers/all-MiniLM-L6-v2", - "revision": "e4ce9877abf3edfe10b0d82785e83bdcb973e22e", + "Rest": { + "api_key": "My super secret", "distribution": null, - "pooling": "useModel" + "dimensions": 384, + "url": "http://localhost:7777", + "request": "{{text}}", + "search_fragments": {}, + "indexing_fragments": {}, + "response": "{{embedding}}", + "headers": {} } } diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap index c66a6b5b3..a52f18079 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/after_registering_settings_task.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, 
non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap index b7faefa8a..b99e15852 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test.rs/test_settings_update/settings_update_processed.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: 
NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"default": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(4), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: 
NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap index c8955e2b6..12e03a28b 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, 
indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: 
Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, batch_uid: 2, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap index 23e43860f..2ea2ebb17 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/Intel to kefir.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, 
prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), 
searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} 2 {uid: 2, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: None, method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000001, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap index 732527fa8..a2a263b6f 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/adding Intel succeeds.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, 
revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), 
response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, batch_uid: 1, status: succeeded, details: { received_documents: 1, indexed_documents: Some(1) }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap index 5e01ffcdf..29fc6abf4 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after adding Intel.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] 
---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, 
sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} 1 {uid: 1, status: enqueued, details: { received_documents: 1, indexed_documents: None }, kind: DocumentAdditionOrUpdate { index_uid: "doggos", primary_key: Some("id"), method: UpdateDocuments, content_file: 00000000-0000-0000-0000-000000000000, documents_count: 1, allow_index_creation: 
true }} ---------------------------------------------------------------------- ### Status: diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap index 1172d1118..ae943bf48 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/after_registering_settings_task_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: 
NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, status: enqueued, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), 
document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [0,] diff --git a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap index 3653eeb9a..9ada7580a 100644 --- a/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap +++ b/crates/index-scheduler/src/scheduler/snapshots/test_embedders.rs/import_vectors/settings_update_processed_vectors.snap @@ -6,7 +6,7 @@ source: crates/index-scheduler/src/scheduler/test_embedders.rs [] ---------------------------------------------------------------------- ### All Tasks: -0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, 
document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} +0 {uid: 0, batch_uid: 0, status: succeeded, details: { settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData } }, kind: SettingsUpdate { index_uid: "doggos", new_settings: Settings { displayed_attributes: WildcardSetting(NotSet), searchable_attributes: WildcardSetting(NotSet), filterable_attributes: NotSet, sortable_attributes: NotSet, ranking_rules: NotSet, stop_words: NotSet, non_separator_tokens: NotSet, separator_tokens: NotSet, dictionary: NotSet, synonyms: NotSet, distinct_attribute: NotSet, proximity_precision: NotSet, typo_tolerance: NotSet, faceting: NotSet, pagination: NotSet, embedders: Set({"A_fakerest": Set(EmbeddingSettings { source: Set(Rest), model: NotSet, revision: NotSet, pooling: NotSet, api_key: Set("My super secret"), dimensions: Set(384), binary_quantized: NotSet, document_template: NotSet, document_template_max_bytes: NotSet, url: Set("http://localhost:7777"), indexing_fragments: NotSet, search_fragments: NotSet, request: Set(String("{{text}}")), response: 
Set(String("{{embedding}}")), headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet }), "B_small_hf": Set(EmbeddingSettings { source: Set(HuggingFace), model: Set("sentence-transformers/all-MiniLM-L6-v2"), revision: Set("e4ce9877abf3edfe10b0d82785e83bdcb973e22e"), pooling: NotSet, api_key: NotSet, dimensions: NotSet, binary_quantized: NotSet, document_template: Set("{{doc.doggo}} the {{doc.breed}} best doggo"), document_template_max_bytes: NotSet, url: NotSet, indexing_fragments: NotSet, search_fragments: NotSet, request: NotSet, response: NotSet, headers: NotSet, search_embedder: NotSet, indexing_embedder: NotSet, distribution: NotSet })}), search_cutoff_ms: NotSet, localized_attributes: NotSet, facet_search: NotSet, prefix_search: NotSet, chat: NotSet, _kind: PhantomData }, is_deletion: false, allow_index_creation: true }} ---------------------------------------------------------------------- ### Status: enqueued [] diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index ee26165c7..e9f21dfe4 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -3,11 +3,11 @@ use std::collections::BTreeMap; use big_s::S; use meili_snap::{json_string, snapshot}; use meilisearch_auth::AuthFilter; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::IndexDocumentsMethod::*; use meilisearch_types::milli::{self}; use meilisearch_types::settings::SettingEmbeddingSettings; use meilisearch_types::tasks::{IndexSwap, KindWithContent}; +use milli::vector::db::IndexEmbeddingConfig; use roaring::RoaringBitmap; use crate::insta_snapshot::snapshot_index_scheduler; @@ -690,11 +690,20 @@ fn test_settings_update() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name, config, user_provided } = configs.first().unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name, config, fragments } = configs.first().unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"default"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); insta::assert_json_snapshot!(config.embedder_options); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); } #[test] diff --git a/crates/index-scheduler/src/scheduler/test_embedders.rs b/crates/index-scheduler/src/scheduler/test_embedders.rs index 305894d0a..a9b920bd2 100644 --- a/crates/index-scheduler/src/scheduler/test_embedders.rs +++ b/crates/index-scheduler/src/scheduler/test_embedders.rs @@ -3,13 +3,14 @@ use std::collections::BTreeMap; use big_s::S; use insta::assert_json_snapshot; use meili_snap::{json_string, snapshot}; -use meilisearch_types::milli::index::IndexEmbeddingConfig; use meilisearch_types::milli::update::Setting; use meilisearch_types::milli::vector::settings::EmbeddingSettings; +use meilisearch_types::milli::vector::SearchQuery; use meilisearch_types::milli::{self, obkv_to_json}; use 
meilisearch_types::settings::{SettingEmbeddingSettings, Settings, Unchecked}; use meilisearch_types::tasks::KindWithContent; use milli::update::IndexDocumentsMethod::*; +use milli::vector::db::IndexEmbeddingConfig; use crate::insta_snapshot::snapshot_index_scheduler; use crate::test_utils::read_json; @@ -85,28 +86,51 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: fakerest_config, user_provided } = + let IndexEmbeddingConfig { name, config: fakerest_config, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(fakerest_config.embedder_options); let fakerest_name = name.clone(); - let IndexEmbeddingConfig { name, config: simple_hf_config, user_provided } = + let IndexEmbeddingConfig { name, config: simple_hf_config, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); insta::assert_json_snapshot!(simple_hf_config.embedder_options); let simple_hf_name = name.clone(); let configs = index_scheduler.embedders("doggos".to_string(), configs).unwrap(); - let (hf_embedder, _, _) = configs.get(&simple_hf_name).unwrap(); - let beagle_embed = hf_embedder.embed_search("Intel the beagle best doggo", None).unwrap(); - let lab_embed = hf_embedder.embed_search("Max the lab best doggo", None).unwrap(); - let patou_embed = hf_embedder.embed_search("kefir the patou best doggo", None).unwrap(); + let hf_runtime = configs.get(&simple_hf_name).unwrap(); + let hf_embedder = &hf_runtime.embedder; + let beagle_embed = hf_embedder + .embed_search(SearchQuery::Text("Intel the beagle best doggo"), None) + .unwrap(); + let lab_embed = + hf_embedder.embed_search(SearchQuery::Text("Max the lab best doggo"), None).unwrap(); + let patou_embed = hf_embedder + .embed_search(SearchQuery::Text("kefir the patou best doggo"), None) + .unwrap(); (fakerest_name, simple_hf_name, beagle_embed, lab_embed, patou_embed) }; @@ -166,22 +190,38 @@ fn import_vectors() { let rtxn = index.read_txn().unwrap(); // Ensure the document have been inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); + let embedders = index.embedding_configs(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below 
#[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); - assert_json_snapshot!(embeddings[&simple_hf_name][0] == lab_embed, @"true"); - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == lab_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -239,25 +279,41 @@ fn import_vectors() { let index = index_scheduler.index("doggos").unwrap(); let rtxn = index.read_txn().unwrap(); + let embedders = index.embedding_configs(); // Ensure the document have been inserted into the relevant bitamp - let configs = index.embedding_configs(&rtxn).unwrap(); + let configs = embedders.embedding_configs(&rtxn).unwrap(); // for consistency with the below #[allow(clippy::get_first)] - let IndexEmbeddingConfig { name, config: _, user_provided: user_defined } = - configs.get(0).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(0).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0]>"); insta::assert_snapshot!(name, @"A_fakerest"); - insta::assert_debug_snapshot!(user_defined, @"RoaringBitmap<[0]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); - let IndexEmbeddingConfig { name, config: _, user_provided } = configs.get(1).unwrap(); + let IndexEmbeddingConfig { name, config: _, fragments } = configs.get(1).unwrap(); + let info = embedders.embedder_info(&rtxn, name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"1"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[]>"); + 
insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[]>"); insta::assert_snapshot!(name, @"B_small_hf"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); let embeddings = index.embeddings(&rtxn, 0).unwrap(); // automatically changed to patou because set to regenerate - assert_json_snapshot!(embeddings[&simple_hf_name][0] == patou_embed, @"true"); + assert_json_snapshot!(embeddings[&simple_hf_name].0[0] == patou_embed, @"true"); // remained beagle - assert_json_snapshot!(embeddings[&fakerest_name][0] == beagle_embed, @"true"); + assert_json_snapshot!(embeddings[&fakerest_name].0[0] == beagle_embed, @"true"); let doc = index.documents(&rtxn, std::iter::once(0)).unwrap()[0].1; let fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); @@ -400,7 +456,7 @@ fn import_vectors_first_and_embedder_later() { // the all the vectors linked to the new specified embedder have been removed // Only the unknown embedders stays in the document DB snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"},{"id":1,"doggo":"intel","_vectors":{"unknown embedder":[1,2,3]}},{"id":2,"doggo":"max","_vectors":{"unknown embedder":[4,5]}},{"id":3,"doggo":"marcel"},{"id":4,"doggo":"sora"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); // even though we specified the vector for the ID 3, it shouldn't be marked // as user provided since we explicitely marked it as NOT user provided. snapshot!(format!("{conf:#?}"), @r###" @@ -426,19 +482,28 @@ fn import_vectors_first_and_embedder_later() { }, quantized: None, }, - user_provided: RoaringBitmap<[1, 2]>, + fragments: FragmentConfigs( + [], + ), }, ] "###); + let info = + index.embedding_configs().embedder_info(&rtxn, "my_doggo_embedder").unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[1, 2, 3]>"); + insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[1, 2]>"); + let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty(), "{embedding:?}"); // the document with the id 3 should keep its original embedding let docid = index.external_documents_ids.get(&rtxn, "3").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embeddings = &embeddings["my_doggo_embedder"]; + let (embeddings, _) = &embeddings["my_doggo_embedder"]; snapshot!(embeddings.len(), @"1"); assert!(embeddings[0].iter().all(|i| *i == 3.0), "{:?}", embeddings[0]); @@ -493,7 +558,7 @@ fn import_vectors_first_and_embedder_later() { "###); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); assert!(!embedding[0].iter().all(|i| *i == 3.0), "{:?}", embedding[0]); @@ -501,7 +566,7 @@ fn import_vectors_first_and_embedder_later() { // the document with the id 4 should generate an embedding let docid = index.external_documents_ids.get(&rtxn, "4").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - 
let embedding = &embeddings["my_doggo_embedder"]; + let (embedding, _) = &embeddings["my_doggo_embedder"]; assert!(!embedding.is_empty()); } @@ -603,33 +668,35 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @r###"[{"id":0,"doggo":"kefir"}]"###); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[0]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); let docid = index.external_documents_ids.get(&rtxn, "0").unwrap().unwrap(); let embeddings = index.embeddings(&rtxn, docid).unwrap(); - let embedding = &embeddings["manual"]; + let (embedding, _) = &embeddings["manual"]; assert!(!embedding.is_empty(), "{embedding:?}"); index_scheduler @@ -647,30 +714,32 @@ fn delete_document_containing_vector() { .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .collect::>(); snapshot!(serde_json::to_string(&documents).unwrap(), @"[]"); - let conf = index.embedding_configs(&rtxn).unwrap(); + let conf = index.embedding_configs().embedding_configs(&rtxn).unwrap(); snapshot!(format!("{conf:#?}"), @r###" - [ - IndexEmbeddingConfig { - name: "manual", - config: EmbeddingConfig { - embedder_options: UserProvided( - EmbedderOptions { - dimensions: 3, - distribution: None, - }, - ), - prompt: PromptData { - template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", - max_bytes: Some( - 400, - ), + [ + IndexEmbeddingConfig { + name: "manual", + config: EmbeddingConfig { + embedder_options: UserProvided( + EmbedderOptions { + dimensions: 3, + distribution: None, }, - quantized: None, + ), + prompt: PromptData { + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", + max_bytes: Some( + 400, + ), }, - user_provided: RoaringBitmap<[]>, + quantized: None, }, - ] - "###); + fragments: FragmentConfigs( + [], + ), + }, + ] + "###); } #[test] diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 30f6868f6..c57e2d042 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -301,6 +301,7 @@ InvalidFacetSearchQuery , InvalidRequest , BAD_REQU InvalidFacetSearchName , InvalidRequest , BAD_REQUEST ; FacetSearchDisabled , InvalidRequest , BAD_REQUEST ; InvalidSearchVector , InvalidRequest , BAD_REQUEST ; +InvalidSearchMedia , 
InvalidRequest , BAD_REQUEST ; InvalidSearchShowMatchesPosition , InvalidRequest , BAD_REQUEST ; InvalidSearchShowRankingScore , InvalidRequest , BAD_REQUEST ; InvalidSimilarShowRankingScore , InvalidRequest , BAD_REQUEST ; @@ -308,6 +309,7 @@ InvalidSearchShowRankingScoreDetails , InvalidRequest , BAD_REQU InvalidSimilarShowRankingScoreDetails , InvalidRequest , BAD_REQUEST ; InvalidSearchSort , InvalidRequest , BAD_REQUEST ; InvalidSearchDistinct , InvalidRequest , BAD_REQUEST ; +InvalidSearchMediaAndVector , InvalidRequest , BAD_REQUEST ; InvalidSettingsDisplayedAttributes , InvalidRequest , BAD_REQUEST ; InvalidSettingsDistinctAttribute , InvalidRequest , BAD_REQUEST ; InvalidSettingsProximityPrecision , InvalidRequest , BAD_REQUEST ; @@ -464,6 +466,7 @@ impl ErrorCode for milli::Error { | UserError::MissingSourceForNested { .. } | UserError::InvalidSettingsEmbedder { .. } => Code::InvalidSettingsEmbedders, UserError::TooManyEmbedders(_) => Code::InvalidSettingsEmbedders, + UserError::TooManyFragments(_) => Code::InvalidSettingsEmbedders, UserError::InvalidPromptForEmbeddings(..) => Code::InvalidSettingsEmbedders, UserError::NoPrimaryKeyCandidateFound => Code::IndexPrimaryKeyNoCandidateFound, UserError::MultiplePrimaryKeyCandidatesFound { .. } => { diff --git a/crates/meilisearch-types/src/features.rs b/crates/meilisearch-types/src/features.rs index 9ec2d321f..3c78035e8 100644 --- a/crates/meilisearch-types/src/features.rs +++ b/crates/meilisearch-types/src/features.rs @@ -21,6 +21,7 @@ pub struct RuntimeTogglableFeatures { pub get_task_documents_route: bool, pub composite_embedders: bool, pub chat_completions: bool, + pub multimodal: bool, } #[derive(Default, Debug, Clone, Copy)] diff --git a/crates/meilisearch-types/src/settings.rs b/crates/meilisearch-types/src/settings.rs index 7d64440ce..9e107a5c3 100644 --- a/crates/meilisearch-types/src/settings.rs +++ b/crates/meilisearch-types/src/settings.rs @@ -9,10 +9,11 @@ use std::str::FromStr; use deserr::{DeserializeError, Deserr, ErrorKind, MergeWithError, ValuePointerRef}; use fst::IntoStreamer; use milli::disabled_typos_terms::DisabledTyposTerms; -use milli::index::{IndexEmbeddingConfig, PrefixSearch}; +use milli::index::PrefixSearch; use milli::proximity::ProximityPrecision; pub use milli::update::ChatSettings; use milli::update::Setting; +use milli::vector::db::IndexEmbeddingConfig; use milli::{Criterion, CriterionError, FilterableAttributesRule, Index, DEFAULT_VALUES_PER_FACET}; use serde::{Deserialize, Serialize, Serializer}; use utoipa::ToSchema; @@ -500,8 +501,11 @@ impl Settings { let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; for (name, config) in configs.iter_mut() { let config_to_check = std::mem::take(config); - let checked_config = - milli::update::validate_embedding_settings(config_to_check.inner, name)?; + let checked_config = milli::update::validate_embedding_settings( + config_to_check.inner, + name, + milli::vector::settings::EmbeddingValidationContext::SettingsPartialUpdate, + )?; *config = SettingEmbeddingSettings { inner: checked_config }; } self.embedders = Setting::Set(configs); @@ -911,6 +915,7 @@ pub fn settings( }; let embedders: BTreeMap<_, _> = index + .embedding_configs() .embedding_configs(rtxn)? .into_iter() .map(|IndexEmbeddingConfig { name, config, .. 
}| { diff --git a/crates/meilisearch/src/analytics/segment_analytics.rs b/crates/meilisearch/src/analytics/segment_analytics.rs index 668a7fded..0abc5c817 100644 --- a/crates/meilisearch/src/analytics/segment_analytics.rs +++ b/crates/meilisearch/src/analytics/segment_analytics.rs @@ -197,6 +197,7 @@ struct Infos { experimental_max_number_of_batched_tasks: usize, experimental_limit_batched_tasks_total_size: u64, experimental_network: bool, + experimental_multimodal: bool, experimental_chat_completions: bool, experimental_get_task_documents_route: bool, experimental_composite_embedders: bool, @@ -303,6 +304,7 @@ impl Infos { get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = features; // We're going to override every sensible information. @@ -322,6 +324,7 @@ impl Infos { experimental_reduce_indexing_memory_usage, experimental_network: network, experimental_chat_completions: chat_completions, + experimental_multimodal: multimodal, experimental_get_task_documents_route: get_task_documents_route, experimental_composite_embedders: composite_embedders, experimental_embedding_cache_entries, diff --git a/crates/meilisearch/src/error.rs b/crates/meilisearch/src/error.rs index b13eb8d7c..91c6c23fa 100644 --- a/crates/meilisearch/src/error.rs +++ b/crates/meilisearch/src/error.rs @@ -76,8 +76,10 @@ pub enum MeilisearchHttpError { DocumentFormat(#[from] DocumentFormatError), #[error(transparent)] Join(#[from] JoinError), - #[error("Invalid request: missing `hybrid` parameter when `vector` is present.")] + #[error("Invalid request: missing `hybrid` parameter when `vector` or `media` are present.")] MissingSearchHybrid, + #[error("Invalid request: both `media` and `vector` parameters are present.")] + MediaAndVector, } impl MeilisearchHttpError { @@ -111,6 +113,7 @@ impl ErrorCode for MeilisearchHttpError { MeilisearchHttpError::DocumentFormat(e) => e.error_code(), MeilisearchHttpError::Join(_) => Code::Internal, MeilisearchHttpError::MissingSearchHybrid => Code::MissingSearchHybrid, + MeilisearchHttpError::MediaAndVector => Code::InvalidSearchMediaAndVector, MeilisearchHttpError::FederationOptionsInNonFederatedRequest(_) => { Code::InvalidMultiSearchFederationOptions } diff --git a/crates/meilisearch/src/lib.rs b/crates/meilisearch/src/lib.rs index 871bd688e..e1acef2ce 100644 --- a/crates/meilisearch/src/lib.rs +++ b/crates/meilisearch/src/lib.rs @@ -563,7 +563,7 @@ fn import_dump( let reader = BufReader::new(file); let reader = DocumentsBatchReader::from_reader(reader)?; - let embedder_configs = index.embedding_configs(&wtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&wtxn)?; let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?; let builder = milli::update::IndexDocuments::new( diff --git a/crates/meilisearch/src/routes/features.rs b/crates/meilisearch/src/routes/features.rs index 179b9cf68..1a1f89b2d 100644 --- a/crates/meilisearch/src/routes/features.rs +++ b/crates/meilisearch/src/routes/features.rs @@ -54,6 +54,7 @@ pub fn configure(cfg: &mut web::ServiceConfig) { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -100,6 +101,8 @@ pub struct RuntimeTogglableFeatures { pub composite_embedders: Option, #[deserr(default)] pub chat_completions: Option, + #[deserr(default)] + pub 
multimodal: Option, } impl From for RuntimeTogglableFeatures { @@ -113,6 +116,7 @@ impl From for RuntimeTogg get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = value; Self { @@ -124,6 +128,7 @@ impl From for RuntimeTogg get_task_documents_route: Some(get_task_documents_route), composite_embedders: Some(composite_embedders), chat_completions: Some(chat_completions), + multimodal: Some(multimodal), } } } @@ -138,6 +143,7 @@ pub struct PatchExperimentalFeatureAnalytics { get_task_documents_route: bool, composite_embedders: bool, chat_completions: bool, + multimodal: bool, } impl Aggregate for PatchExperimentalFeatureAnalytics { @@ -155,6 +161,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: new.get_task_documents_route, composite_embedders: new.composite_embedders, chat_completions: new.chat_completions, + multimodal: new.multimodal, }) } @@ -181,6 +188,7 @@ impl Aggregate for PatchExperimentalFeatureAnalytics { get_task_documents_route: Some(false), composite_embedders: Some(false), chat_completions: Some(false), + multimodal: Some(false), })), (status = 401, description = "The authorization header is missing", body = ResponseError, content_type = "application/json", example = json!( { @@ -223,6 +231,7 @@ async fn patch_features( .composite_embedders .unwrap_or(old_features.composite_embedders), chat_completions: new_features.0.chat_completions.unwrap_or(old_features.chat_completions), + multimodal: new_features.0.multimodal.unwrap_or(old_features.multimodal), }; // explicitly destructure for analytics rather than using the `Serialize` implementation, because @@ -237,6 +246,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, } = new_features; analytics.publish( @@ -249,6 +259,7 @@ async fn patch_features( get_task_documents_route, composite_embedders, chat_completions, + multimodal, }, &req, ); diff --git a/crates/meilisearch/src/routes/indexes/documents.rs b/crates/meilisearch/src/routes/indexes/documents.rs index 50eec46fe..a93d736f7 100644 --- a/crates/meilisearch/src/routes/indexes/documents.rs +++ b/crates/meilisearch/src/routes/indexes/documents.rs @@ -1452,7 +1452,6 @@ fn some_documents<'a, 't: 'a>( ) -> Result> + 'a, ResponseError> { let fields_ids_map = index.fields_ids_map(rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(rtxn)?; Ok(index.iter_documents(rtxn, doc_ids)?.map(move |ret| { ret.map_err(ResponseError::from).and_then(|(key, document)| -> Result<_, ResponseError> { @@ -1468,15 +1467,9 @@ fn some_documents<'a, 't: 'a>( Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in index.embeddings(rtxn, key)? { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(key)); - let embeddings = ExplicitVectors { - embeddings: Some(vector.into()), - regenerate: !user_provided, - }; + for (name, (vector, regenerate)) in index.embeddings(rtxn, key)? 
{ + let embeddings = + ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(MeilisearchHttpError::from)?, diff --git a/crates/meilisearch/src/routes/indexes/facet_search.rs b/crates/meilisearch/src/routes/indexes/facet_search.rs index 41f306746..18ad54ccf 100644 --- a/crates/meilisearch/src/routes/indexes/facet_search.rs +++ b/crates/meilisearch/src/routes/indexes/facet_search.rs @@ -56,6 +56,8 @@ pub struct FacetSearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -94,6 +96,7 @@ impl FacetSearchAggregator { facet_name, vector, q, + media, filter, matching_strategy, attributes_to_search_on, @@ -108,6 +111,7 @@ impl FacetSearchAggregator { facet_names: Some(facet_name.clone()).into_iter().collect(), additional_search_parameters_provided: q.is_some() || vector.is_some() + || media.is_some() || filter.is_some() || *matching_strategy != MatchingStrategy::default() || attributes_to_search_on.is_some() @@ -291,6 +295,7 @@ impl From for SearchQuery { facet_name: _, q, vector, + media, filter, matching_strategy, attributes_to_search_on, @@ -312,6 +317,7 @@ impl From for SearchQuery { SearchQuery { q, + media, offset: DEFAULT_SEARCH_OFFSET(), limit: DEFAULT_SEARCH_LIMIT(), page, diff --git a/crates/meilisearch/src/routes/indexes/search.rs b/crates/meilisearch/src/routes/indexes/search.rs index 333ae1944..697ae9241 100644 --- a/crates/meilisearch/src/routes/indexes/search.rs +++ b/crates/meilisearch/src/routes/indexes/search.rs @@ -205,6 +205,8 @@ impl TryFrom for SearchQuery { Ok(Self { q: other.q, + // `media` not supported for `GET` + media: None, vector: other.vector.map(CS::into_inner), offset: other.offset.0, limit: other.limit.0, @@ -481,28 +483,30 @@ pub fn search_kind( index_uid: String, index: &milli::Index, ) -> Result { + let is_placeholder_query = + if let Some(q) = query.q.as_deref() { q.trim().is_empty() } else { true }; + let non_placeholder_query = !is_placeholder_query; + let is_media = query.media.is_some(); // handle with care, the order of cases matters, the semantics is subtle - match (query.q.as_deref(), &query.hybrid, query.vector.as_deref()) { - // empty query, no vector => placeholder search - (Some(q), _, None) if q.trim().is_empty() => Ok(SearchKind::KeywordOnly), - // no query, no vector => placeholder search - (None, _, None) => Ok(SearchKind::KeywordOnly), - // hybrid.semantic_ratio == 1.0 => vector - (_, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) - } - // hybrid.semantic_ratio == 0.0 => keyword - (_, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { + match (is_media, non_placeholder_query, &query.hybrid, query.vector.as_deref()) { + // media + vector => error + (true, _, _, Some(_)) => Err(MeilisearchHttpError::MediaAndVector.into()), + // media + !hybrid => error + (true, _, None, _) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // vector + !hybrid => error + (_, _, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // hybrid S0 => keyword + (_, _, Some(HybridQuery { semantic_ratio, embedder: _ }), _) if **semantic_ratio == 0.0 => { Ok(SearchKind::KeywordOnly) } - // no query, 
hybrid, vector => semantic - (None, Some(HybridQuery { semantic_ratio: _, embedder }), Some(v)) => { - SearchKind::semantic(index_scheduler, index_uid, index, embedder, Some(v.len())) + // !q + !vector => placeholder search + (false, false, _, None) => Ok(SearchKind::KeywordOnly), + // hybrid S100 => semantic + (_, _, Some(HybridQuery { semantic_ratio, embedder }), v) if **semantic_ratio == 1.0 => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) } - // query, no hybrid, no vector => keyword - (Some(_), None, None) => Ok(SearchKind::KeywordOnly), - // query, hybrid, maybe vector => hybrid - (Some(_), Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( + // q + hybrid => hybrid + (_, true, Some(HybridQuery { semantic_ratio, embedder }), v) => SearchKind::hybrid( index_scheduler, index_uid, index, @@ -510,7 +514,11 @@ pub fn search_kind( **semantic_ratio, v.map(|v| v.len()), ), - - (_, None, Some(_)) => Err(MeilisearchHttpError::MissingSearchHybrid.into()), + // !q + hybrid => semantic + (_, false, Some(HybridQuery { semantic_ratio: _, embedder }), v) => { + SearchKind::semantic(index_scheduler, index_uid, index, embedder, v.map(|v| v.len())) + } + // q => keyword + (false, true, None, None) => Ok(SearchKind::KeywordOnly), } } diff --git a/crates/meilisearch/src/routes/indexes/search_analytics.rs b/crates/meilisearch/src/routes/indexes/search_analytics.rs index b16e2636e..07f79eba7 100644 --- a/crates/meilisearch/src/routes/indexes/search_analytics.rs +++ b/crates/meilisearch/src/routes/indexes/search_analytics.rs @@ -61,6 +61,8 @@ pub struct SearchAggregator { semantic_ratio: bool, hybrid: bool, retrieve_vectors: bool, + // Number of requests containing `media` + total_media: usize, // every time a search is done, we increment the counter linked to the used settings matching_strategy: HashMap, @@ -101,6 +103,7 @@ impl SearchAggregator { let SearchQuery { q, vector, + media, offset, limit, page, @@ -175,6 +178,11 @@ impl SearchAggregator { if let Some(ref vector) = vector { ret.max_vector_size = vector.len(); } + + if media.is_some() { + ret.total_media = 1; + } + ret.retrieve_vectors |= retrieve_vectors; if query.is_finite_pagination() { @@ -277,6 +285,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -327,6 +336,7 @@ impl Aggregate for SearchAggregator { self.retrieve_vectors |= retrieve_vectors; self.semantic_ratio |= semantic_ratio; self.hybrid |= hybrid; + self.total_media += total_media; // pagination self.max_limit = self.max_limit.max(max_limit); @@ -403,6 +413,7 @@ impl Aggregate for SearchAggregator { show_ranking_score_details, semantic_ratio, hybrid, + total_media, total_degraded, total_used_negative_operator, ranking_score_threshold, @@ -450,6 +461,7 @@ impl Aggregate for SearchAggregator { "hybrid": { "enabled": hybrid, "semantic_ratio": semantic_ratio, + "total_media": total_media, }, "pagination": { "max_limit": max_limit, diff --git a/crates/meilisearch/src/routes/indexes/settings.rs b/crates/meilisearch/src/routes/indexes/settings.rs index a4b7a5219..308977a6e 100644 --- a/crates/meilisearch/src/routes/indexes/settings.rs +++ b/crates/meilisearch/src/routes/indexes/settings.rs @@ -755,6 +755,14 @@ fn validate_settings( if matches!(embedder.indexing_embedder, Setting::Set(_)) { features.check_composite_embedders("setting `indexingEmbedder`")?; } + + if 
matches!(embedder.indexing_fragments, Setting::Set(_)) { + features.check_multimodal("setting `indexingFragments`")?; + } + + if matches!(embedder.search_fragments, Setting::Set(_)) { + features.check_multimodal("setting `searchFragments`")?; + } } } diff --git a/crates/meilisearch/src/routes/multi_search_analytics.rs b/crates/meilisearch/src/routes/multi_search_analytics.rs index 3fa23f630..c24875797 100644 --- a/crates/meilisearch/src/routes/multi_search_analytics.rs +++ b/crates/meilisearch/src/routes/multi_search_analytics.rs @@ -42,6 +42,7 @@ impl MultiSearchAggregator { federation_options, q: _, vector: _, + media: _, offset: _, limit: _, page: _, diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs index 5e543c53f..1c987a70c 100644 --- a/crates/meilisearch/src/search/mod.rs +++ b/crates/meilisearch/src/search/mod.rs @@ -64,6 +64,8 @@ pub struct SearchQuery { pub q: Option, #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default = DEFAULT_SEARCH_OFFSET(), error = DeserrJsonError)] @@ -147,6 +149,7 @@ impl From for SearchQuery { ranking_score_threshold: ranking_score_threshold.map(RankingScoreThreshold::from), q: None, vector: None, + media: None, offset: DEFAULT_SEARCH_OFFSET(), page: None, hits_per_page: None, @@ -220,6 +223,7 @@ impl fmt::Debug for SearchQuery { let Self { q, vector, + media, hybrid, offset, limit, @@ -274,6 +278,9 @@ impl fmt::Debug for SearchQuery { ); } } + if let Some(media) = media { + debug.field("media", media); + } if let Some(hybrid) = hybrid { debug.field("hybrid", &hybrid); } @@ -399,10 +406,10 @@ impl SearchKind { route: Route, ) -> Result<(String, Arc, bool), ResponseError> { let rtxn = index.read_txn()?; - let embedder_configs = index.embedding_configs(&rtxn)?; + let embedder_configs = index.embedding_configs().embedding_configs(&rtxn)?; let embedders = index_scheduler.embedders(index_uid, embedder_configs)?; - let (embedder, _, quantized) = embedders + let (embedder, quantized) = embedders .get(embedder_name) .ok_or(match route { Route::Search | Route::MultiSearch => { @@ -412,6 +419,7 @@ impl SearchKind { milli::UserError::InvalidSimilarEmbedder(embedder_name.to_owned()) } }) + .map(|runtime| (runtime.embedder.clone(), runtime.is_quantized)) .map_err(milli::Error::from)?; if let Some(vector_len) = vector_len { @@ -481,8 +489,10 @@ pub struct SearchQueryWithIndex { pub index_uid: IndexUid, #[deserr(default, error = DeserrJsonError)] pub q: Option, - #[deserr(default, error = DeserrJsonError)] + #[deserr(default, error = DeserrJsonError)] pub vector: Option>, + #[deserr(default, error = DeserrJsonError)] + pub media: Option, #[deserr(default, error = DeserrJsonError)] pub hybrid: Option, #[deserr(default, error = DeserrJsonError)] @@ -563,6 +573,7 @@ impl SearchQueryWithIndex { let SearchQuery { q, vector, + media, hybrid, offset, limit, @@ -593,6 +604,7 @@ impl SearchQueryWithIndex { index_uid, q, vector, + media, hybrid, offset: if offset == DEFAULT_SEARCH_OFFSET() { None } else { Some(offset) }, limit: if limit == DEFAULT_SEARCH_LIMIT() { None } else { Some(limit) }, @@ -627,6 +639,7 @@ impl SearchQueryWithIndex { federation_options, q, vector, + media, offset, limit, page, @@ -657,6 +670,7 @@ impl SearchQueryWithIndex { SearchQuery { q, vector, + media, offset: offset.unwrap_or(DEFAULT_SEARCH_OFFSET()), limit: limit.unwrap_or(DEFAULT_SEARCH_LIMIT()), page, 
@@ -958,6 +972,9 @@ pub fn prepare_search<'t>( time_budget: TimeBudget, features: RoFeatures, ) -> Result<(milli::Search<'t>, bool, usize, usize), ResponseError> { + if query.media.is_some() { + features.check_multimodal("passing `media` in a search query")?; + } let mut search = index.search(rtxn); search.time_budget(time_budget); if let Some(ranking_score_threshold) = query.ranking_score_threshold { @@ -983,14 +1000,27 @@ pub fn prepare_search<'t>( let deadline = std::time::Instant::now() + std::time::Duration::from_secs(10); + let q = query.q.as_deref(); + let media = query.media.as_ref(); + + let search_query = match (q, media) { + (Some(text), None) => milli::vector::SearchQuery::Text(text), + (q, media) => milli::vector::SearchQuery::Media { q, media }, + }; + embedder - .embed_search(query.q.as_ref().unwrap(), Some(deadline)) + .embed_search(search_query, Some(deadline)) .map_err(milli::vector::Error::from) .map_err(milli::Error::from)? } }; - - search.semantic(embedder_name.clone(), embedder.clone(), *quantized, Some(vector)); + search.semantic( + embedder_name.clone(), + embedder.clone(), + *quantized, + Some(vector), + query.media.clone(), + ); } SearchKind::Hybrid { embedder_name, embedder, quantized, semantic_ratio: _ } => { if let Some(q) = &query.q { @@ -1002,6 +1032,7 @@ pub fn prepare_search<'t>( embedder.clone(), *quantized, query.vector.clone(), + query.media.clone(), ); } } @@ -1126,6 +1157,7 @@ pub fn perform_search( locales, // already used in prepare_search vector: _, + media: _, hybrid: _, offset: _, ranking_score_threshold: _, @@ -1328,7 +1360,6 @@ struct HitMaker<'a> { vectors_fid: Option, retrieve_vectors: RetrieveVectors, to_retrieve_ids: BTreeSet, - embedding_configs: Vec, formatter_builder: MatcherBuilder<'a>, formatted_options: BTreeMap, show_ranking_score: bool, @@ -1443,8 +1474,6 @@ impl<'a> HitMaker<'a> { &displayed_ids, ); - let embedding_configs = index.embedding_configs(rtxn)?; - Ok(Self { index, rtxn, @@ -1453,7 +1482,6 @@ impl<'a> HitMaker<'a> { vectors_fid, retrieve_vectors, to_retrieve_ids, - embedding_configs, formatter_builder, formatted_options, show_ranking_score: format.show_ranking_score, @@ -1499,14 +1527,8 @@ impl<'a> HitMaker<'a> { Some(Value::Object(map)) => map, _ => Default::default(), }; - for (name, vector) in self.index.embeddings(self.rtxn, id)? { - let user_provided = self - .embedding_configs - .iter() - .find(|conf| conf.name == name) - .is_some_and(|conf| conf.user_provided.contains(id)); - let embeddings = - ExplicitVectors { embeddings: Some(vector.into()), regenerate: !user_provided }; + for (name, (vector, regenerate)) in self.index.embeddings(self.rtxn, id)? 
{ + let embeddings = ExplicitVectors { embeddings: Some(vector.into()), regenerate }; vectors.insert( name, serde_json::to_value(embeddings).map_err(InternalError::SerdeJson)?, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 3d3bc01db..9b111186d 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -2188,7 +2188,8 @@ async fn import_dump_v6_containing_experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2314,7 +2315,8 @@ async fn import_dump_v6_containing_batches_and_enqueued_tasks() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -2420,7 +2422,8 @@ async fn generate_and_import_dump_containing_vectors() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); diff --git a/crates/meilisearch/tests/features/mod.rs b/crates/meilisearch/tests/features/mod.rs index d0d457d3e..ec5838d35 100644 --- a/crates/meilisearch/tests/features/mod.rs +++ b/crates/meilisearch/tests/features/mod.rs @@ -25,7 +25,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -41,7 +42,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -57,7 +59,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -74,7 +77,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -91,7 +95,8 @@ async fn experimental_features() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); } @@ -115,7 +120,8 @@ async fn experimental_feature_metrics() { "network": false, "getTaskDocumentsRoute": false, "compositeEmbedders": false, - "chatCompletions": false + "chatCompletions": false, + "multimodal": false } "###); @@ -162,7 +168,7 @@ async fn errors() { meili_snap::snapshot!(code, @"400 Bad Request"); meili_snap::snapshot!(meili_snap::json_string!(response), @r###" { - "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`", + "message": "Unknown field `NotAFeature`: expected one of `metrics`, `logsRoute`, `editDocumentsByFunction`, `containsFilter`, `network`, `getTaskDocumentsRoute`, `compositeEmbedders`, `chatCompletions`, `multimodal`", "code": "bad_request", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#bad_request" diff --git a/crates/meilisearch/tests/search/hybrid.rs b/crates/meilisearch/tests/search/hybrid.rs index be2a724b0..d95e6fb64 100644 --- a/crates/meilisearch/tests/search/hybrid.rs +++ 
b/crates/meilisearch/tests/search/hybrid.rs @@ -499,7 +499,7 @@ async fn query_combination() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Invalid request: missing `hybrid` parameter when `vector` is present.", + "message": "Invalid request: missing `hybrid` parameter when `vector` or `media` are present.", "code": "missing_search_hybrid", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#missing_search_hybrid" diff --git a/crates/meilisearch/tests/vector/rest.rs b/crates/meilisearch/tests/vector/rest.rs index 87296c36a..e03563bcc 100644 --- a/crates/meilisearch/tests/vector/rest.rs +++ b/crates/meilisearch/tests/vector/rest.rs @@ -1,9 +1,9 @@ use std::collections::BTreeMap; use std::sync::atomic::AtomicUsize; +use std::time::Duration; use meili_snap::{json_string, snapshot}; use reqwest::IntoUrl; -use std::time::Duration; use tokio::sync::mpsc; use wiremock::matchers::{method, path}; use wiremock::{Mock, MockServer, Request, ResponseTemplate}; @@ -408,13 +408,13 @@ async fn bad_request() { .await; snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" - { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", - "code": "vector_embedding_error", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#vector_embedding_error" - } - "###); + { + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + } + "###); // A repeat string appears inside a repeated value let (response, code) = index @@ -437,7 +437,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated", + "message": "Error while generating embeddings: user error: in `request.input.input`: \"{{..}}\" appears nested inside of a value that is itself repeated\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -460,7 +460,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array", + "message": "Error while generating embeddings: user error: in `request.input.repeat`: \"{{..}}\" appears outside of an array\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -483,7 +483,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #0", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at 
position #1, but found at position #0\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -506,7 +506,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2", + "message": "Error while generating embeddings: user error: in `request.input`: \"{{..}}\" expected at position #1, but found at position #2\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -529,7 +529,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value", + "message": "Error while generating embeddings: user error: in `request.input[0]`: Expected \"{{text}}\" inside of the repeated value\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -556,7 +556,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{..}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -577,7 +577,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -598,7 +598,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`", + "message": "Error while generating embeddings: user error: in `request.repeated.data[1]`: Found \"{{text}}\", but it was already present in `request.repeated.input`\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than 
\"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -619,7 +619,7 @@ async fn bad_request() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)", + "message": "Error while generating embeddings: user error: in `request.data`: Found \"{{text}}\", but it was already present in `request.input[0]` (repeated)\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" @@ -920,7 +920,7 @@ async fn bad_settings() { snapshot!(code, @"400 Bad Request"); snapshot!(response, @r###" { - "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found", + "message": "Error while generating embeddings: user error: in `request`: \"{{text}}\" not found\n - Note: this template is using a document template, and so expects to contain the placeholder \"{{text}}\" rather than \"{{fragment}}\"", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" diff --git a/crates/meilitool/src/main.rs b/crates/meilitool/src/main.rs index dd1213782..b967e620c 100644 --- a/crates/meilitool/src/main.rs +++ b/crates/meilitool/src/main.rs @@ -545,7 +545,6 @@ fn export_documents( let rtxn = index.read_txn()?; let fields_ids_map = index.fields_ids_map(&rtxn)?; let all_fields: Vec<_> = fields_ids_map.iter().map(|(id, _)| id).collect(); - let embedding_configs = index.embedding_configs(&rtxn)?; if let Some(offset) = offset { eprintln!("Skipping {offset} documents"); @@ -592,17 +591,12 @@ fn export_documents( .into()); }; - for (embedder_name, embeddings) in embeddings { - let user_provided = embedding_configs - .iter() - .find(|conf| conf.name == embedder_name) - .is_some_and(|conf| conf.user_provided.contains(id)); - + for (embedder_name, (embeddings, regenerate)) in embeddings { let embeddings = ExplicitVectors { embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors( embeddings, )), - regenerate: !user_provided, + regenerate, }; vectors .insert(embedder_name, serde_json::to_value(embeddings).unwrap()); diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index 2136ec97e..f8886da8e 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -288,6 +288,8 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidPromptForEmbeddings(String, crate::prompt::error::NewPromptError), #[error("Too many embedders in the configuration. Found {0}, but limited to 256.")] TooManyEmbedders(usize), + #[error("Too many fragments in the configuration. 
Found {0}, but limited to 256.")]
+    TooManyFragments(usize),
     #[error("Cannot find embedder with name `{0}`.")]
     InvalidSearchEmbedder(String),
     #[error("Cannot find embedder with name `{0}`.")]
diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs
index e9e63a853..b2ec992ba 100644
--- a/crates/milli/src/index.rs
+++ b/crates/milli/src/index.rs
@@ -30,7 +30,8 @@ use crate::order_by_map::OrderByMap;
 use crate::prompt::PromptData;
 use crate::proximity::ProximityPrecision;
 use crate::update::new::StdResult;
-use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
+use crate::vector::db::IndexEmbeddingConfigs;
+use crate::vector::{ArroyStats, ArroyWrapper, Embedding};
 use crate::{
     default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
     FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@@ -177,7 +178,7 @@ pub struct Index {
     pub field_id_docid_facet_strings: Database<FieldDocIdFacetStringCodec, Str>,

     /// Maps an embedder name to its id in the arroy store.
-    pub embedder_category_id: Database<Str, U8>,
+    pub(crate) embedder_category_id: Database<Str, U8>,

     /// Vector store based on arroy™.
     pub vector_arroy: arroy::Database<Unspecified>,

@@ -1745,34 +1746,6 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(txn, main_key::LOCALIZED_ATTRIBUTES_RULES)
     }

-    /// Put the embedding configs:
-    /// 1. The name of the embedder
-    /// 2. The configuration option for this embedder
-    /// 3. The list of documents with a user provided embedding
-    pub(crate) fn put_embedding_configs(
-        &self,
-        wtxn: &mut RwTxn<'_>,
-        configs: Vec<IndexEmbeddingConfig>,
-    ) -> heed::Result<()> {
-        self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put(
-            wtxn,
-            main_key::EMBEDDING_CONFIGS,
-            &configs,
-        )
-    }
-
-    pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> {
-        self.main.remap_key_type::<Str>().delete(wtxn, main_key::EMBEDDING_CONFIGS)
-    }
-
-    pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> Result<Vec<IndexEmbeddingConfig>> {
-        Ok(self
-            .main
-            .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>()
-            .get(rtxn, main_key::EMBEDDING_CONFIGS)?
-            .unwrap_or_default())
-    }
-
     pub(crate) fn put_search_cutoff(&self, wtxn: &mut RwTxn<'_>, cutoff: u64) -> heed::Result<()> {
         self.main.remap_types::<Str, BEU64>().put(wtxn, main_key::SEARCH_CUTOFF, &cutoff)
     }
@@ -1785,19 +1758,29 @@ impl Index {
         self.main.remap_key_type::<Str>().delete(wtxn, main_key::SEARCH_CUTOFF)
     }

+    pub fn embedding_configs(&self) -> IndexEmbeddingConfigs {
+        IndexEmbeddingConfigs::new(self.main, self.embedder_category_id)
+    }
+
     pub fn embeddings(
         &self,
         rtxn: &RoTxn<'_>,
         docid: DocumentId,
-    ) -> Result<BTreeMap<String, Vec<Embedding>>> {
+    ) -> Result<BTreeMap<String, (Vec<Embedding>, bool)>> {
         let mut res = BTreeMap::new();
-        let embedding_configs = self.embedding_configs(rtxn)?;
-        for config in embedding_configs {
-            let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
-            let reader =
-                ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
+        let embedders = self.embedding_configs();
+        for config in embedders.embedding_configs(rtxn)?
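// A minimal consumer sketch for the new return shape above (hypothetical
// helper, not part of this change; it assumes only the signature
// `Result<BTreeMap<String, (Vec<Embedding>, bool)>>`, where the boolean is
// the document's `must_regenerate` status for that embedder):
//
//     fn print_regenerate_flags(index: &Index, rtxn: &RoTxn<'_>, docid: DocumentId) -> Result<()> {
//         for (name, (embeddings, regenerate)) in index.embeddings(rtxn, docid)? {
//             println!("{name}: {} vector(s), regenerate: {regenerate}", embeddings.len());
//         }
//         Ok(())
//     }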
{ + let embedder_info = embedders.embedder_info(rtxn, &config.name)?.unwrap(); + let reader = ArroyWrapper::new( + self.vector_arroy, + embedder_info.embedder_id, + config.config.quantized(), + ); let embeddings = reader.item_vectors(rtxn, docid)?; - res.insert(config.name.to_owned(), embeddings); + res.insert( + config.name.to_owned(), + (embeddings, embedder_info.embedding_status.must_regenerate(docid)), + ); } Ok(res) } @@ -1809,9 +1792,9 @@ impl Index { pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { let mut stats = ArroyStats::default(); - let embedding_configs = self.embedding_configs(rtxn)?; - for config in embedding_configs { - let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let embedding_configs = self.embedding_configs(); + for config in embedding_configs.embedding_configs(rtxn)? { + let embedder_id = embedding_configs.embedder_id(rtxn, &config.name)?.unwrap(); let reader = ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); reader.aggregate_stats(rtxn, &mut stats)?; @@ -1936,13 +1919,6 @@ impl Index { } } -#[derive(Debug, Deserialize, Serialize)] -pub struct IndexEmbeddingConfig { - pub name: String, - pub config: EmbeddingConfig, - pub user_provided: RoaringBitmap, -} - #[derive(Debug, Default, Deserialize, Serialize)] pub struct ChatConfig { pub description: String, diff --git a/crates/milli/src/prompt/context.rs b/crates/milli/src/prompt/context.rs index 84523333a..8958cb693 100644 --- a/crates/milli/src/prompt/context.rs +++ b/crates/milli/src/prompt/context.rs @@ -6,12 +6,18 @@ use liquid::{ObjectView, ValueView}; #[derive(Debug, Clone)] pub struct Context<'a, D: ObjectView, F: ArrayView> { document: &'a D, - fields: &'a F, + fields: Option<&'a F>, } impl<'a, D: ObjectView, F: ArrayView> Context<'a, D, F> { pub fn new(document: &'a D, fields: &'a F) -> Self { - Self { document, fields } + Self { document, fields: Some(fields) } + } +} + +impl<'a, D: ObjectView> Context<'a, D, Vec> { + pub fn without_fields(document: &'a D) -> Self { + Self { document, fields: None } } } @@ -21,17 +27,27 @@ impl ObjectView for Context<'_, D, F> { } fn size(&self) -> i64 { - 2 + if self.fields.is_some() { + 2 + } else { + 1 + } } fn keys<'k>(&'k self) -> Box> + 'k> { - Box::new(["doc", "fields"].iter().map(|s| KStringCow::from_static(s))) + let keys = if self.fields.is_some() { + either::Either::Left(["doc", "fields"]) + } else { + either::Either::Right(["doc"]) + }; + + Box::new(keys.into_iter().map(KStringCow::from_static)) } fn values<'k>(&'k self) -> Box + 'k> { Box::new( std::iter::once(self.document.as_value()) - .chain(std::iter::once(self.fields.as_value())), + .chain(self.fields.iter().map(|fields| fields.as_value())), ) } @@ -40,13 +56,13 @@ impl ObjectView for Context<'_, D, F> { } fn contains_key(&self, index: &str) -> bool { - index == "doc" || index == "fields" + index == "doc" || (index == "fields" && self.fields.is_some()) } fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { - match index { - "doc" => Some(self.document.as_value()), - "fields" => Some(self.fields.as_value()), + match (index, &self.fields) { + ("doc", _) => Some(self.document.as_value()), + ("fields", Some(fields)) => Some(fields.as_value()), _ => None, } } diff --git a/crates/milli/src/prompt/document.rs b/crates/milli/src/prompt/document.rs index b00c4cb42..1125c8fba 100644 --- a/crates/milli/src/prompt/document.rs +++ b/crates/milli/src/prompt/document.rs @@ -144,18 +144,19 @@ impl ValueView for Document<'_> { use 
crate::update::new::document::Document as DocumentTrait; #[derive(Debug)] -pub struct ParseableDocument<'doc, D> { +pub struct ParseableDocument<'a, 'doc, D: DocumentTrait<'a> + Debug> { document: D, doc_alloc: &'doc Bump, + _marker: std::marker::PhantomData<&'a ()>, } -impl<'doc, D> ParseableDocument<'doc, D> { +impl<'a, 'doc, D: DocumentTrait<'a> + Debug> ParseableDocument<'a, 'doc, D> { pub fn new(document: D, doc_alloc: &'doc Bump) -> Self { - Self { document, doc_alloc } + Self { document, doc_alloc, _marker: std::marker::PhantomData } } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ObjectView for ParseableDocument<'a, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -195,7 +196,7 @@ impl<'doc, D: DocumentTrait<'doc> + Debug> ObjectView for ParseableDocument<'doc } } -impl<'doc, D: DocumentTrait<'doc> + Debug> ValueView for ParseableDocument<'doc, D> { +impl<'a, D: DocumentTrait<'a> + Debug> ValueView for ParseableDocument<'a, '_, D> { fn as_debug(&self) -> &dyn Debug { self } diff --git a/crates/milli/src/prompt/fields.rs b/crates/milli/src/prompt/fields.rs index 8d006f0b7..5a842268c 100644 --- a/crates/milli/src/prompt/fields.rs +++ b/crates/milli/src/prompt/fields.rs @@ -121,10 +121,10 @@ impl ObjectView for FieldValue<'_, D> { pub struct OwnedFields<'a, D: ObjectView>(Vec>); #[derive(Debug)] -pub struct BorrowedFields<'a, 'map, D: ObjectView> { +pub struct BorrowedFields<'a, 'doc, 'map, D: ObjectView> { document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, } impl<'a, D: ObjectView> OwnedFields<'a, D> { @@ -138,11 +138,11 @@ impl<'a, D: ObjectView> OwnedFields<'a, D> { } } -impl<'a, 'map, D: ObjectView> BorrowedFields<'a, 'map, D> { +impl<'a, 'doc, 'map, D: ObjectView> BorrowedFields<'a, 'doc, 'map, D> { pub fn new( document: &'a D, field_id_map: &'a RefCell>, - doc_alloc: &'a Bump, + doc_alloc: &'doc Bump, ) -> Self { Self { document, field_id_map, doc_alloc } } @@ -170,7 +170,7 @@ impl ArrayView for OwnedFields<'_, D> { } } -impl ArrayView for BorrowedFields<'_, '_, D> { +impl ArrayView for BorrowedFields<'_, '_, '_, D> { fn as_value(&self) -> &dyn ValueView { self } @@ -212,7 +212,7 @@ impl ArrayView for BorrowedFields<'_, '_, D> { } } -impl ValueView for BorrowedFields<'_, '_, D> { +impl ValueView for BorrowedFields<'_, '_, '_, D> { fn as_debug(&self) -> &dyn std::fmt::Debug { self } @@ -288,11 +288,11 @@ impl ValueView for OwnedFields<'_, D> { } } -struct ArraySource<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArraySource<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArraySource<'_, '_, D> { +impl fmt::Display for ArraySource<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "[")?; for item in self.s.values() { @@ -303,11 +303,11 @@ impl fmt::Display for ArraySource<'_, '_, D> { } } -struct ArrayRender<'a, 'map, D: ObjectView> { - s: &'a BorrowedFields<'a, 'map, D>, +struct ArrayRender<'a, 'doc, 'map, D: ObjectView> { + s: &'a BorrowedFields<'a, 'doc, 'map, D>, } -impl fmt::Display for ArrayRender<'_, '_, D> { +impl fmt::Display for ArrayRender<'_, '_, '_, D> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for item in self.s.values() { write!(f, "{}", item.render())?; diff --git a/crates/milli/src/prompt/mod.rs b/crates/milli/src/prompt/mod.rs index a8288f83d..03b20a090 100644 --- a/crates/milli/src/prompt/mod.rs 
+++ b/crates/milli/src/prompt/mod.rs @@ -9,12 +9,11 @@ use std::fmt::Debug; use std::num::NonZeroUsize; use bumpalo::Bump; -use document::ParseableDocument; +pub(crate) use document::{Document, ParseableDocument}; use error::{NewPromptError, RenderPromptError}; -use fields::{BorrowedFields, OwnedFields}; +pub use fields::{BorrowedFields, OwnedFields}; -use self::context::Context; -use self::document::Document; +pub use self::context::Context; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::update::del_add::DelAdd; use crate::GlobalFieldsIdsMap; @@ -108,8 +107,8 @@ impl Prompt { } pub fn render_document< - 'a, // lifetime of the borrow of the document - 'doc: 'a, // lifetime of the allocator, will live for an entire chunk of documents + 'a, // lifetime of the borrow of the document + 'doc, // lifetime of the allocator, will live for an entire chunk of documents >( &self, external_docid: &str, diff --git a/crates/milli/src/search/hybrid.rs b/crates/milli/src/search/hybrid.rs index b63f6288f..c906e1eb7 100644 --- a/crates/milli/src/search/hybrid.rs +++ b/crates/milli/src/search/hybrid.rs @@ -7,6 +7,7 @@ use roaring::RoaringBitmap; use crate::score_details::{ScoreDetails, ScoreValue, ScoringStrategy}; use crate::search::new::{distinct_fid, distinct_single_docid}; use crate::search::SemanticSearch; +use crate::vector::SearchQuery; use crate::{Index, MatchingWords, Result, Search, SearchResult}; struct ScoreWithRatioResult { @@ -225,12 +226,9 @@ impl Search<'_> { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); } - // no vector search against placeholder search - let Some(query) = search.query.take() else { - return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); - }; // no embedder, no semantic search - let Some(SemanticSearch { vector, embedder_name, embedder, quantized }) = semantic else { + let Some(SemanticSearch { vector, embedder_name, embedder, quantized, media }) = semantic + else { return Ok(return_keyword_results(self.limit, self.offset, keyword_results)); }; @@ -241,9 +239,17 @@ impl Search<'_> { let span = tracing::trace_span!(target: "search::hybrid", "embed_one"); let _entered = span.enter(); + let q = search.query.as_deref(); + let media = media.as_ref(); + + let query = match (q, media) { + (Some(text), None) => SearchQuery::Text(text), + (q, media) => SearchQuery::Media { q, media }, + }; + let deadline = std::time::Instant::now() + std::time::Duration::from_secs(3); - match embedder.embed_search(&query, Some(deadline)) { + match embedder.embed_search(query, Some(deadline)) { Ok(embedding) => embedding, Err(error) => { tracing::error!(error=%error, "Embedding failed"); @@ -257,8 +263,13 @@ impl Search<'_> { } }; - search.semantic = - Some(SemanticSearch { vector: Some(vector_query), embedder_name, embedder, quantized }); + search.semantic = Some(SemanticSearch { + vector: Some(vector_query), + embedder_name, + embedder, + quantized, + media, + }); // TODO: would be better to have two distinct functions at this point let vector_results = search.execute()?; diff --git a/crates/milli/src/search/mod.rs b/crates/milli/src/search/mod.rs index 62183afc3..97d542524 100644 --- a/crates/milli/src/search/mod.rs +++ b/crates/milli/src/search/mod.rs @@ -12,7 +12,7 @@ use self::new::{execute_vector_search, PartialSearchResult, VectorStoreStats}; use crate::filterable_attributes_rules::{filtered_matching_patterns, matching_features}; use crate::index::MatchingStrategy; use crate::score_details::{ScoreDetails, 
ScoringStrategy}; -use crate::vector::Embedder; +use crate::vector::{Embedder, Embedding}; use crate::{ execute_search, filtered_universe, AscDesc, DefaultSearchLogger, DocumentId, Error, Index, Result, SearchContext, TimeBudget, UserError, @@ -32,6 +32,7 @@ pub mod similar; #[derive(Debug, Clone)] pub struct SemanticSearch { vector: Option>, + media: Option, embedder_name: String, embedder: Arc, quantized: bool, @@ -93,9 +94,10 @@ impl<'a> Search<'a> { embedder_name: String, embedder: Arc, quantized: bool, - vector: Option>, + vector: Option, + media: Option, ) -> &mut Search<'a> { - self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector }); + self.semantic = Some(SemanticSearch { embedder_name, embedder, quantized, vector, media }); self } @@ -231,24 +233,28 @@ impl<'a> Search<'a> { degraded, used_negative_operator, } = match self.semantic.as_ref() { - Some(SemanticSearch { vector: Some(vector), embedder_name, embedder, quantized }) => { - execute_vector_search( - &mut ctx, - vector, - self.scoring_strategy, - universe, - &self.sort_criteria, - &self.distinct, - self.geo_param, - self.offset, - self.limit, - embedder_name, - embedder, - *quantized, - self.time_budget.clone(), - self.ranking_score_threshold, - )? - } + Some(SemanticSearch { + vector: Some(vector), + embedder_name, + embedder, + quantized, + media: _, + }) => execute_vector_search( + &mut ctx, + vector, + self.scoring_strategy, + universe, + &self.sort_criteria, + &self.distinct, + self.geo_param, + self.offset, + self.limit, + embedder_name, + embedder, + *quantized, + self.time_budget.clone(), + self.ranking_score_threshold, + )?, _ => execute_search( &mut ctx, self.query.as_deref(), diff --git a/crates/milli/src/search/new/tests/integration.rs b/crates/milli/src/search/new/tests/integration.rs index 9e2afca97..38f39e18b 100644 --- a/crates/milli/src/search/new/tests/integration.rs +++ b/crates/milli/src/search/new/tests/integration.rs @@ -8,7 +8,7 @@ use maplit::{btreemap, hashset}; use crate::progress::Progress; use crate::update::new::indexer; use crate::update::{IndexerConfig, Settings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, Criterion, FilterableAttributesRule, Index}; pub const CONTENT: &str = include_str!("../../../../tests/assets/test_set.ndjson"); use crate::constants::RESERVED_GEO_FIELD_NAME; @@ -55,7 +55,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/src/search/new/vector_sort.rs b/crates/milli/src/search/new/vector_sort.rs index 834f97384..2c201e899 100644 --- a/crates/milli/src/search/new/vector_sort.rs +++ b/crates/milli/src/search/new/vector_sort.rs @@ -32,8 +32,8 @@ impl VectorSort { ) -> Result { let embedder_index = ctx .index - .embedder_category_id - .get(ctx.txn, embedder_name)? + .embedding_configs() + .embedder_id(ctx.txn, embedder_name)? 
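// With `embedder_category_id` now `pub(crate)` (see the index.rs hunk above),
// callers resolve embedder ids through `IndexEmbeddingConfigs`. A minimal
// sketch of the lookup pattern used here and in `Similar` below, with a
// hypothetical embedder name:
//
//     let embedder_index = index
//         .embedding_configs()
//         .embedder_id(rtxn, "default")?
//         .ok_or_else(|| UserError::InvalidSearchEmbedder("default".to_owned()))?;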
.ok_or_else(|| crate::UserError::InvalidSearchEmbedder(embedder_name.to_owned()))?; Ok(Self { diff --git a/crates/milli/src/search/similar.rs b/crates/milli/src/search/similar.rs index 759940f9c..903b5fcf9 100644 --- a/crates/milli/src/search/similar.rs +++ b/crates/milli/src/search/similar.rs @@ -64,10 +64,13 @@ impl<'a> Similar<'a> { let universe = universe; - let embedder_index = - self.index.embedder_category_id.get(self.rtxn, &self.embedder_name)?.ok_or_else( - || crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()), - )?; + let embedder_index = self + .index + .embedding_configs() + .embedder_id(self.rtxn, &self.embedder_name)? + .ok_or_else(|| { + crate::UserError::InvalidSimilarEmbedder(self.embedder_name.to_owned()) + })?; let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_index, self.quantized); let results = reader.nns_by_item( diff --git a/crates/milli/src/test_index.rs b/crates/milli/src/test_index.rs index f2e34c615..6bb6b1345 100644 --- a/crates/milli/src/test_index.rs +++ b/crates/milli/src/test_index.rs @@ -18,7 +18,7 @@ use crate::update::{ self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting, Settings, }; use crate::vector::settings::{EmbedderSource, EmbeddingSettings}; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{db_snap, obkv_to_json, Filter, FilterableAttributesRule, Index, Search, SearchResult}; pub(crate) struct TempIndex { @@ -66,7 +66,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); match self.index_documents_config.update_method { IndexDocumentsMethod::ReplaceDocuments => { @@ -151,7 +151,7 @@ impl TempIndex { let db_fields_ids_map = self.inner.fields_ids_map(&rtxn)?; let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.embedding_configs; + let embedders = InnerIndexSettings::from_index(&self.inner, &rtxn, None)?.runtime_embedders; let mut indexer = indexer::DocumentOperation::new(); let external_document_ids: Vec<_> = @@ -223,7 +223,7 @@ fn aborting_indexation() { let db_fields_ids_map = index.inner.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let payload = documents!([ { "id": 1, "name": "kevin" }, diff --git a/crates/milli/src/update/clear_documents.rs b/crates/milli/src/update/clear_documents.rs index b0ae070de..01631e9a3 100644 --- a/crates/milli/src/update/clear_documents.rs +++ b/crates/milli/src/update/clear_documents.rs @@ -64,11 +64,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> { self.index.delete_geo_faceted_documents_ids(self.wtxn)?; // Remove all user-provided bits from the configs - let mut configs = self.index.embedding_configs(self.wtxn)?; - for config in configs.iter_mut() { - config.user_provided.clear(); - } - self.index.put_embedding_configs(self.wtxn, configs)?; + self.index.embedding_configs().clear_embedder_info_docids(self.wtxn)?; // Clear the other databases. 
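// What replaces the per-config `user_provided` bitmaps cleared above: each
// embedder now carries an `EmbedderInfo` whose `EmbeddingStatus` records, per
// document, whether the embedding was user-provided and whether it must be
// regenerated. A rough sketch of that bookkeeping, with hypothetical field
// names (the real type lives in `crate::vector::db`):
//
//     struct EmbeddingStatusSketch {
//         // documents whose embeddings came from `_vectors` in the payload
//         user_provided: RoaringBitmap,
//         // documents whose `regenerate` flag differs from the default
//         // implied by `user_provided`
//         regenerate_exceptions: RoaringBitmap,
//     }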
 external_documents_ids.clear(self.wtxn)?;
diff --git a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
index e1981a615..064cfd154 100644
--- a/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
+++ b/crates/milli/src/update/index_documents/extract/extract_vector_points.rs
@@ -1,4 +1,5 @@
 use std::cmp::Ordering;
+use std::collections::{BTreeMap, VecDeque};
 use std::convert::{TryFrom, TryInto};
 use std::fs::File;
 use std::io::{self, BufReader, BufWriter};
@@ -6,25 +7,29 @@ use std::mem::size_of;
 use std::str::from_utf8;
 use std::sync::Arc;

+use bumpalo::Bump;
 use bytemuck::cast_slice;
+use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt};
 use grenad::Writer;
+use obkv::KvReaderU16;
 use ordered_float::OrderedFloat;
-use roaring::RoaringBitmap;
 use serde_json::Value;

 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::constants::RESERVED_VECTORS_FIELD_NAME;
 use crate::error::FaultSource;
 use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
-use crate::index::IndexEmbeddingConfig;
 use crate::progress::EmbedderStats;
 use crate::prompt::Prompt;
 use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd};
 use crate::update::settings::InnerIndexSettingsDiff;
+use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta};
 use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution};
+use crate::vector::extractor::{Extractor, ExtractorDiff, RequestFragmentExtractor};
 use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState};
+use crate::vector::session::{EmbedSession, Metadata, OnEmbed};
 use crate::vector::settings::ReindexAction;
-use crate::vector::{Embedder, Embedding};
+use crate::vector::{Embedder, Embedding, RuntimeEmbedder, RuntimeFragment};
 use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort};

 /// The length of the elements that are always in the buffer when inserting new values.
@@ -37,12 +42,13 @@ pub struct ExtractedVectorPoints {
     pub remove_vectors: grenad::Reader<BufReader<File>>,
     // docid -> prompt
     pub prompts: grenad::Reader<BufReader<File>>,
+    // docid, extractor_id -> Option<Value>
+    pub inputs: grenad::Reader<BufReader<File>>,
     // embedder
     pub embedder_name: String,
-    pub embedder: Arc<Embedder>,
-    pub add_to_user_provided: RoaringBitmap,
-    pub remove_from_user_provided: RoaringBitmap,
+    pub runtime: Arc<RuntimeEmbedder>,
+    pub embedding_status_delta: EmbeddingStatusDelta,
 }

 enum VectorStateDelta {
@@ -56,46 +62,74 @@
     // Remove any previous vector
     // Note: changing the value of the prompt **does require** recording this delta
     NowGenerated(String),
+
+    // Add and remove the vectors computed from the fragments.
+    UpdateGeneratedFromFragments(Vec<(String, ExtractorDiff<Value>)>),
+
+    /// Wasn't generated from fragments, but now is.
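// The variants of this enum are flattened by `into_values()` (just below)
// into one tuple; `push_vectors_diff` destructures it as follows (names from
// this file, map type inferred from `ExtractorDiff::into_list_of_changes`):
//
//     let (must_remove, prompt, fragment_delta, add_vectors) = delta.into_values();
//     // must_remove: bool                               -> wipe the existing vectors
//     // prompt: String                                  -> document-template prompt to embed
//     // fragment_delta: BTreeMap<String, Option<Value>> -> per-fragment change (None = removal)
//     // add_vectors: Vec<Vec<f32>>                      -> manually provided embeddings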
+ /// Delete any previous vectors and add the new vectors + NowGeneratedFromFragments(Vec<(String, Value)>), } impl VectorStateDelta { - fn into_values(self) -> (bool, String, Vec>) { + fn into_values(self) -> (bool, String, BTreeMap>, Vec>) { match self { VectorStateDelta::NoChange => Default::default(), - VectorStateDelta::NowRemoved => (true, Default::default(), Default::default()), - // We always delete the previous vectors - VectorStateDelta::NowManual(add) => (true, Default::default(), add), - VectorStateDelta::NowGenerated(prompt) => (true, prompt, Default::default()), + VectorStateDelta::NowRemoved => { + (true, Default::default(), Default::default(), Default::default()) + } + VectorStateDelta::NowManual(add) => (true, Default::default(), Default::default(), add), + VectorStateDelta::NowGenerated(prompt) => { + (true, prompt, Default::default(), Default::default()) + } + VectorStateDelta::UpdateGeneratedFromFragments(fragments) => ( + false, + Default::default(), + ExtractorDiff::into_list_of_changes(fragments), + Default::default(), + ), + VectorStateDelta::NowGeneratedFromFragments(items) => ( + true, + Default::default(), + ExtractorDiff::into_list_of_changes( + items.into_iter().map(|(name, value)| (name, ExtractorDiff::Added(value))), + ), + Default::default(), + ), } } } -struct EmbedderVectorExtractor { +struct EmbedderVectorExtractor<'a> { embedder_name: String, - embedder: Arc, - prompt: Arc, + embedder_info: &'a EmbedderInfo, + runtime: Arc, // (docid) -> (prompt) prompts_writer: Writer>, + // (docid, extractor_id) -> (Option) + inputs_writer: Writer>, // (docid) -> () remove_vectors_writer: Writer>, // (docid, _index) -> KvWriterDelAdd -> Vector manual_vectors_writer: Writer>, - // The docids of the documents that contains a user defined embedding - add_to_user_provided: RoaringBitmap, + embedding_status_delta: EmbeddingStatusDelta, action: ExtractionAction, } -struct DocumentOperation { - // The docids of the documents that contains an auto-generated embedding - remove_from_user_provided: RoaringBitmap, -} - enum ExtractionAction { SettingsFullReindex, - SettingsRegeneratePrompts { old_prompt: Arc }, - DocumentOperation(DocumentOperation), + SettingsRegeneratePrompts { + old_runtime: Arc, + }, + /// List of fragments to update/add + SettingsRegenerateFragments { + // name and indices, respectively in old and new runtime, of the fragments to examine. 
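// The map below is keyed by fragment name; the value pair holds the fragment's
// position in the old runtime (None when the fragment is newly added) and in
// the new runtime. Building it relies on fragments being sorted by name, hence
// the binary searches used further down:
//
//     let old_value = old_runtime
//         .fragments()
//         .binary_search_by_key(&name, |fragment| &fragment.name)
//         .ok();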
+ must_regenerate_fragments: BTreeMap, usize)>, + old_runtime: Arc, + }, + DocumentOperation, } struct ManualEmbedderErrors { @@ -183,8 +217,8 @@ impl ManualEmbedderErrors { pub fn extract_vector_points( obkv_documents: grenad::Reader, indexer: GrenadParameters, - embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, + embedder_info: &[(String, EmbedderInfo)], possible_embedding_mistakes: &PossibleEmbeddingMistakes, ) -> Result<(Vec, UnusedVectorsDistribution)> { let mut unused_vectors_distribution = UnusedVectorsDistribution::new(); @@ -202,15 +236,15 @@ pub fn extract_vector_points( let mut extractors = Vec::new(); - let mut configs = settings_diff.new.embedding_configs.clone().into_inner(); - let old_configs = &settings_diff.old.embedding_configs; - + let mut configs = settings_diff.new.runtime_embedders.clone().into_inner(); + let old_configs = &settings_diff.old.runtime_embedders; if reindex_vectors { for (name, action) in settings_diff.embedding_config_updates.iter() { if let Some(action) = action.reindex() { - let Some((embedder_name, (embedder, prompt, _quantized))) = - configs.remove_entry(name) - else { + let (_, embedder_info) = + embedder_info.iter().find(|(embedder_name, _)| embedder_name == name).unwrap(); + + let Some((embedder_name, runtime)) = configs.remove_entry(name) else { tracing::error!(embedder = name, "Requested embedder config not found"); continue; }; @@ -229,6 +263,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -238,24 +278,68 @@ pub fn extract_vector_points( let action = match action { ReindexAction::FullReindex => ExtractionAction::SettingsFullReindex, - ReindexAction::RegeneratePrompts => { - let Some((_, old_prompt, _quantized)) = old_configs.get(name) else { + ReindexAction::RegenerateFragments(regenerate_fragments) => { + let Some(old_runtime) = old_configs.get(name) else { tracing::error!(embedder = name, "Old embedder config not found"); continue; }; - ExtractionAction::SettingsRegeneratePrompts { old_prompt } + let fragment_diffs = regenerate_fragments + .iter() + .filter_map(|(name, fragment)| match fragment { + crate::vector::settings::RegenerateFragment::Update => { + let old_value = old_runtime + .fragments() + .binary_search_by_key(&name, |fragment| &fragment.name) + .ok(); + let Ok(new_value) = runtime + .fragments() + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (old_value, new_value))) + } + // was already handled in transform + crate::vector::settings::RegenerateFragment::Remove => None, + crate::vector::settings::RegenerateFragment::Add => { + let Ok(new_value) = runtime + .fragments() + .binary_search_by_key(&name, |fragment| &fragment.name) + else { + return None; + }; + Some((name.clone(), (None, new_value))) + } + }) + .collect(); + ExtractionAction::SettingsRegenerateFragments { + old_runtime: old_runtime.clone(), + must_regenerate_fragments: fragment_diffs, + } + } + + ReindexAction::RegeneratePrompts => { + let Some(old_runtime) = old_configs.get(name) else { + tracing::error!(embedder = name, "Old embedder config not found"); + continue; + }; + + ExtractionAction::SettingsRegeneratePrompts { + old_runtime: old_runtime.clone(), + } } }; extractors.push(EmbedderVectorExtractor { embedder_name, - 
embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), + embedding_status_delta: Default::default(), action, }); } else { @@ -264,8 +348,12 @@ pub fn extract_vector_points( } } else { // document operation + for (embedder_name, runtime) in configs.into_iter() { + let (_, embedder_info) = embedder_info + .iter() + .find(|(name, _)| embedder_name.as_str() == name.as_str()) + .unwrap(); - for (embedder_name, (embedder, prompt, _quantized)) in configs.into_iter() { // (docid, _index) -> KvWriterDelAdd -> Vector let manual_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -280,6 +368,12 @@ pub fn extract_vector_points( tempfile::tempfile()?, ); + let inputs_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + // (docid) -> () let remove_vectors_writer = create_writer( indexer.chunk_compression_type, @@ -289,22 +383,23 @@ pub fn extract_vector_points( extractors.push(EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided: RoaringBitmap::new(), - action: ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided: RoaringBitmap::new(), - }), + embedding_status_delta: Default::default(), + action: ExtractionAction::DocumentOperation, }); } } let mut key_buffer = Vec::new(); let mut cursor = obkv_documents.into_cursor()?; + let mut doc_alloc = Bump::new(); while let Some((key, value)) = cursor.move_on_next()? { + doc_alloc.reset(); // this must always be serialized as (docid, external_docid); const SIZE_OF_DOCUMENTID: usize = std::mem::size_of::(); let (docid_bytes, external_id_bytes) = @@ -320,9 +415,12 @@ pub fn extract_vector_points( // lazily get it when needed let document_id = || -> Value { from_utf8(external_id_bytes).unwrap().into() }; + let regenerate_for_embedders = embedder_info + .iter() + .filter(|&(_, infos)| infos.embedding_status.must_regenerate(docid)) + .map(|(name, _)| name.clone()); let mut parsed_vectors = ParsedVectorsDiff::new( - docid, - embedders_configs, + regenerate_for_embedders, obkv, old_vectors_fid, new_vectors_fid, @@ -331,44 +429,40 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt, + runtime, + embedder_info, prompts_writer, + inputs_writer, remove_vectors_writer, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, action, } in extractors.iter_mut() { - let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_)); + let embedder_is_manual = matches!(*runtime.embedder, Embedder::UserProvided(_)); let (old, new) = parsed_vectors.remove(embedder_name); + let new_must_regenerate = new.must_regenerate(); let delta = match action { ExtractionAction::SettingsFullReindex => match old { // A full reindex can be triggered either by: // 1. a new embedder // 2. an existing embedder changed so that it must regenerate all generated embeddings. 
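// A hypothetical document carrying the inline `_vectors` state handled by the
// `VectorState::Inline` arm below (shape per `ExplicitVectors`, as in the
// meilitool hunk earlier; the embedder name is illustrative):
//
//     let doc = serde_json::json!({
//         "id": 1,
//         "_vectors": {
//             "default": { "embeddings": [[0.1, 0.2, 0.3]], "regenerate": false }
//         }
//     });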
// For a new embedder, there can be `_vectors.embedder` embeddings to add to the DB - VectorState::Inline(vectors) => { - if !vectors.must_regenerate() { - add_to_user_provided.insert(docid); - } - - match vectors.into_array_of_vectors() { - Some(add_vectors) => { - if add_vectors.len() > usize::from(u8::MAX) { - return Err(crate::Error::UserError( - crate::UserError::TooManyVectors( - document_id().to_string(), - add_vectors.len(), - ), - )); - } - VectorStateDelta::NowManual(add_vectors) + VectorState::Inline(vectors) => match vectors.into_array_of_vectors() { + Some(add_vectors) => { + if add_vectors.len() > usize::from(u8::MAX) { + return Err(crate::Error::UserError( + crate::UserError::TooManyVectors( + document_id().to_string(), + add_vectors.len(), + ), + )); } - None => VectorStateDelta::NoChange, + VectorStateDelta::NowManual(add_vectors) } - } + None => VectorStateDelta::NoChange, + }, // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated @@ -381,11 +475,81 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, new_fields_ids_map)? + let has_fragments = !runtime.fragments().is_empty(); + + if has_fragments { + regenerate_all_fragments( + runtime.fragments(), + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? + } } }, + ExtractionAction::SettingsRegenerateFragments { + must_regenerate_fragments, + old_runtime, + } => { + if old.must_regenerate() { + let has_fragments = !runtime.fragments().is_empty(); + let old_has_fragments = !old_runtime.fragments().is_empty(); + + let is_adding_fragments = has_fragments && !old_has_fragments; + + if is_adding_fragments { + regenerate_all_fragments( + runtime.fragments(), + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else if !has_fragments { + // removing fragments + regenerate_prompt(obkv, &runtime.document_template, new_fields_ids_map)? 
+ } else { + let mut fragment_diff = Vec::new(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let obkv_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + for (name, (old_index, new_index)) in must_regenerate_fragments { + let Some(new) = runtime.fragments().get(*new_index) else { + continue; + }; + + let new = + RequestFragmentExtractor::new(new, &doc_alloc).ignore_errors(); + + let diff = { + let old = old_index.as_ref().and_then(|old| { + let old = old_runtime.fragments().get(*old)?; + Some( + RequestFragmentExtractor::new(old, &doc_alloc) + .ignore_errors(), + ) + }); + let old = old.as_ref(); + Extractor::diff_settings(&new, &obkv_document, &(), old) + } + .expect("ignoring errors so this cannot fail"); + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) + } + } else { + // we can simply ignore user provided vectors as they are not regenerated and are + // already in the DB since this is an existing embedder + VectorStateDelta::NoChange + } + } // prompt regeneration is only triggered for existing embedders - ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { + ExtractionAction::SettingsRegeneratePrompts { old_runtime } => { if old.must_regenerate() { if embedder_is_manual { ManualEmbedderErrors::push_error( @@ -395,24 +559,32 @@ pub fn extract_vector_points( ); continue; } - regenerate_if_prompt_changed( - obkv, - (old_prompt, prompt), - (old_fields_ids_map, new_fields_ids_map), - )? + let has_fragments = !runtime.fragments().is_empty(); + + if has_fragments { + regenerate_all_fragments( + runtime.fragments(), + &doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + regenerate_if_prompt_changed( + obkv, + (&old_runtime.document_template, &runtime.document_template), + (old_fields_ids_map, new_fields_ids_map), + )? 
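// `ExtractorDiff` is what a fragment extractor returns when comparing old and
// new rendered values. Its definition is not part of this diff; a plausible
// reconstruction, inferred from `into_list_of_changes` and the
// `ExtractorDiff::Added(value)` constructor used in this file:
//
//     enum ExtractorDiff<Input> {
//         Removed,        // fragment no longer renders for this document
//         Added(Input),   // fragment renders where it previously did not
//         Updated(Input), // rendered value changed, re-embed it
//         Unchanged,      // nothing to do
//     }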
+ } } else { // we can simply ignore user provided vectors as they are not regenerated and are // already in the DB since this is an existing embedder VectorStateDelta::NoChange } } - ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) => extract_vector_document_diff( - docid, + ExtractionAction::DocumentOperation => extract_vector_document_diff( obkv, - prompt, - (add_to_user_provided, remove_from_user_provided), + runtime, + &doc_alloc, (old, new), (old_fields_ids_map, new_fields_ids_map), document_id, @@ -421,13 +593,25 @@ pub fn extract_vector_points( &mut manual_errors, )?, }; + + // update the embedding status + push_embedding_status_delta( + embedding_status_delta, + docid, + &delta, + new_must_regenerate, + &embedder_info.embedding_status, + ); + // and we finally push the unique vectors into the writer push_vectors_diff( remove_vectors_writer, prompts_writer, + inputs_writer, manual_vectors_writer, &mut key_buffer, delta, + runtime.fragments(), )?; } @@ -444,45 +628,65 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder, - prompt: _, + runtime, + embedder_info: _, prompts_writer, + inputs_writer, remove_vectors_writer, - action, + action: _, manual_vectors_writer, - add_to_user_provided, + embedding_status_delta, } in extractors { - let remove_from_user_provided = - if let ExtractionAction::DocumentOperation(DocumentOperation { - remove_from_user_provided, - }) = action - { - remove_from_user_provided - } else { - Default::default() - }; - results.push(ExtractedVectorPoints { manual_vectors: writer_into_reader(manual_vectors_writer)?, remove_vectors: writer_into_reader(remove_vectors_writer)?, prompts: writer_into_reader(prompts_writer)?, - embedder, + inputs: writer_into_reader(inputs_writer)?, + runtime, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, }) } Ok((results, unused_vectors_distribution)) } +fn push_embedding_status_delta( + embedding_status_delta: &mut EmbeddingStatusDelta, + docid: DocumentId, + delta: &VectorStateDelta, + new_must_regenerate: bool, + embedding_status: &EmbeddingStatus, +) { + let (old_is_user_provided, old_must_regenerate) = + embedding_status.is_user_provided_must_regenerate(docid); + let new_is_user_provided = match delta { + VectorStateDelta::NoChange => old_is_user_provided, + VectorStateDelta::NowRemoved => { + embedding_status_delta.clear_docid(docid, old_is_user_provided, old_must_regenerate); + return; + } + VectorStateDelta::NowManual(_) => true, + VectorStateDelta::NowGenerated(_) + | VectorStateDelta::UpdateGeneratedFromFragments(_) + | VectorStateDelta::NowGeneratedFromFragments(_) => false, + }; + + embedding_status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); +} + #[allow(clippy::too_many_arguments)] // feel free to find efficient way to factor arguments fn extract_vector_document_diff( - docid: DocumentId, obkv: &obkv::KvReader, - prompt: &Prompt, - (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), + runtime: &RuntimeEmbedder, + doc_alloc: &Bump, (old, new): (VectorState, VectorState), (old_fields_ids_map, new_fields_ids_map): (&FieldIdMapWithMetadata, &FieldIdMapWithMetadata), document_id: impl Fn() -> Value, @@ -490,16 +694,6 @@ fn extract_vector_document_diff( embedder_is_manual: bool, manual_errors: &mut Option, ) -> Result { - match (old.must_regenerate(), new.must_regenerate()) { - (true, 
true) | (false, false) => {} - (true, false) => { - add_to_user_provided.insert(docid); - } - (false, true) => { - remove_from_user_provided.insert(docid); - } - } - let delta = match (old, new) { // regardless of the previous state, if a document now contains inline _vectors, they must // be extracted manually @@ -530,22 +724,55 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // Don't give up if the old prompt was failing - let old_prompt = Some(&prompt).map(|p| { - p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) - .unwrap_or_default() - }); - let new_prompt = - prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; - if old_prompt.as_ref() != Some(&new_prompt) { - let old_prompt = old_prompt.unwrap_or_default(); - tracing::trace!( - "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + let has_fragments = !runtime.fragments().is_empty(); + if has_fragments { + let mut fragment_diff = Vec::new(); + let old_fields_ids_map = old_fields_ids_map.as_fields_ids_map(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let old_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Deletion, + old_fields_ids_map, ); - VectorStateDelta::NowGenerated(new_prompt) + + let new_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + + for new in runtime.fragments() { + let name = &new.name; + let fragment = + RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = fragment + .diff_documents(&old_document, &new_document, &()) + .expect("ignoring errors so this cannot fail"); + + fragment_diff.push((name.clone(), diff)); + } + VectorStateDelta::UpdateGeneratedFromFragments(fragment_diff) } else { - tracing::trace!("⏭️ Prompt unmodified, skipping"); - VectorStateDelta::NoChange + let prompt = &runtime.document_template; + // Don't give up if the old prompt was failing + let old_prompt = Some(&prompt).map(|p| { + p.render_kvdeladd(obkv, DelAdd::Deletion, old_fields_ids_map) + .unwrap_or_default() + }); + let new_prompt = + prompt.render_kvdeladd(obkv, DelAdd::Addition, new_fields_ids_map)?; + if old_prompt.as_ref() != Some(&new_prompt) { + let old_prompt = old_prompt.unwrap_or_default(); + tracing::trace!( + "🚀 Changing prompt from\n{old_prompt}\n===to===\n{new_prompt}" + ); + VectorStateDelta::NowGenerated(new_prompt) + } else { + tracing::trace!("⏭️ Prompt unmodified, skipping"); + VectorStateDelta::NoChange + } } } else { VectorStateDelta::NowRemoved @@ -567,15 +794,25 @@ fn extract_vector_document_diff( ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); return Ok(VectorStateDelta::NoChange); } - // becomes autogenerated - VectorStateDelta::NowGenerated(prompt.render_kvdeladd( - obkv, - DelAdd::Addition, - new_fields_ids_map, - )?) + + let has_fragments = !runtime.fragments().is_empty(); + + if has_fragments { + regenerate_all_fragments( + runtime.fragments(), + doc_alloc, + new_fields_ids_map, + obkv, + ) + } else { + // becomes autogenerated + VectorStateDelta::NowGenerated(runtime.document_template.render_kvdeladd( + obkv, + DelAdd::Addition, + new_fields_ids_map, + )?) 
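// The branch above repeats a pattern used throughout this extractor: when an
// embedder defines request fragments, they take precedence over the document
// template. Condensed from the surrounding arms:
//
//     let delta = if !runtime.fragments().is_empty() {
//         regenerate_all_fragments(runtime.fragments(), doc_alloc, new_fields_ids_map, obkv)
//     } else {
//         VectorStateDelta::NowGenerated(runtime.document_template.render_kvdeladd(
//             obkv,
//             DelAdd::Addition,
//             new_fields_ids_map,
//         )?)
//     };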
+ } } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -593,8 +830,6 @@ fn extract_vector_document_diff( // then they are user-provided and nothing possibly changed VectorStateDelta::NoChange } else { - // make sure the document is always removed from user provided on removal - remove_from_user_provided.insert(docid); VectorStateDelta::NowRemoved } } @@ -629,16 +864,44 @@ fn regenerate_prompt( Ok(VectorStateDelta::NowGenerated(prompt)) } +fn regenerate_all_fragments<'a>( + fragments: impl IntoIterator, + doc_alloc: &Bump, + new_fields_ids_map: &FieldIdMapWithMetadata, + obkv: &KvReaderU16, +) -> VectorStateDelta { + let mut fragment_diff = Vec::new(); + let new_fields_ids_map = new_fields_ids_map.as_fields_ids_map(); + + let obkv_document = crate::update::new::document::KvDelAddDocument::new( + obkv, + DelAdd::Addition, + new_fields_ids_map, + ); + for new in fragments { + let name = &new.name; + let new = RequestFragmentExtractor::new(new, doc_alloc).ignore_errors(); + + let diff = new.extract(&obkv_document, &()).expect("ignoring errors so this cannot fail"); + if let Some(value) = diff { + fragment_diff.push((name.clone(), value)); + } + } + VectorStateDelta::NowGeneratedFromFragments(fragment_diff) +} + /// We cannot compute the diff between both Del and Add vectors. /// We'll push every vector and compute the difference later in TypedChunk. fn push_vectors_diff( remove_vectors_writer: &mut Writer>, prompts_writer: &mut Writer>, + inputs_writer: &mut Writer>, manual_vectors_writer: &mut Writer>, key_buffer: &mut Vec, delta: VectorStateDelta, + fragments: &[RuntimeFragment], ) -> Result<()> { - let (must_remove, prompt, mut add_vectors) = delta.into_values(); + let (must_remove, prompt, mut fragment_delta, mut add_vectors) = delta.into_values(); if must_remove { key_buffer.truncate(TRUNCATE_SIZE); remove_vectors_writer.insert(&key_buffer, [])?; @@ -648,23 +911,49 @@ fn push_vectors_diff( prompts_writer.insert(&key_buffer, prompt.as_bytes())?; } - // We sort and dedup the vectors - add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); - add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + if !fragment_delta.is_empty() { + let mut scratch = Vec::new(); + let mut fragment_delta: Vec<_> = fragments + .iter() + .filter_map(|fragment| { + let delta = fragment_delta.remove(&fragment.name)?; + Some((fragment.id, delta)) + }) + .collect(); - // insert vectors into the writer - for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { - // Generate the key by extending the unique index to it. - key_buffer.truncate(TRUNCATE_SIZE); - let index = u16::try_from(i).unwrap(); - key_buffer.extend_from_slice(&index.to_be_bytes()); + fragment_delta.sort_unstable_by_key(|(id, _)| *id); + for (id, value) in fragment_delta { + key_buffer.truncate(TRUNCATE_SIZE); + key_buffer.push(id); + if let Some(value) = value { + scratch.clear(); + serde_json::to_writer(&mut scratch, &value).unwrap(); + inputs_writer.insert(&key_buffer, &scratch)?; + } else { + inputs_writer.insert(&key_buffer, [])?; + } + } + } - // We insert only the Add part of the Obkv to inform - // that we only want to remove all those vectors. 
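// Key layouts written by `push_vectors_diff`, as read from this function:
// every key starts with the same `TRUNCATE_SIZE`-byte document prefix, then
//
//     key_buffer.truncate(TRUNCATE_SIZE);
//     key_buffer.push(id); // fragment inputs: a single u8 extractor id
//
//     key_buffer.truncate(TRUNCATE_SIZE);
//     key_buffer.extend_from_slice(&index.to_be_bytes()); // manual vectors: big-endian u16 index
//
// The one-byte extractor id also lines up with the 256-fragment limit added
// in error.rs, and the u16 index matches the `take(u16::MAX as usize)` cap below.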
- let mut obkv = KvWriterDelAdd::memory(); - obkv.insert(DelAdd::Addition, cast_slice(&vector))?; - let bytes = obkv.into_inner()?; - manual_vectors_writer.insert(&key_buffer, bytes)?; + if !add_vectors.is_empty() { + // We sort and dedup the vectors + add_vectors.sort_unstable_by(|a, b| compare_vectors(a, b)); + add_vectors.dedup_by(|a, b| compare_vectors(a, b).is_eq()); + + // insert vectors into the writer + for (i, vector) in add_vectors.into_iter().enumerate().take(u16::MAX as usize) { + // Generate the key by extending the unique index to it. + key_buffer.truncate(TRUNCATE_SIZE); + let index = u16::try_from(i).unwrap(); + key_buffer.extend_from_slice(&index.to_be_bytes()); + + // We insert only the Add part of the Obkv to inform + // that we only want to remove all those vectors. + let mut obkv = KvWriterDelAdd::memory(); + obkv.insert(DelAdd::Addition, cast_slice(&vector))?; + let bytes = obkv.into_inner()?; + manual_vectors_writer.insert(&key_buffer, bytes)?; + } } Ok(()) @@ -677,17 +966,18 @@ fn compare_vectors(a: &[f32], b: &[f32]) -> Ordering { #[allow(clippy::too_many_arguments)] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] -pub fn extract_embeddings( +pub fn extract_embeddings_from_prompts( // docid, prompt prompt_reader: grenad::Reader, indexer: GrenadParameters, - embedder: Arc, + runtime: Arc, embedder_name: &str, possible_embedding_mistakes: &PossibleEmbeddingMistakes, embedder_stats: &EmbedderStats, unused_vectors_distribution: &UnusedVectorsDistribution, request_threads: &ThreadPoolNoAbort, ) -> Result>> { + let embedder = &runtime.embedder; let n_chunks = embedder.chunk_count_hint(); // chunk level parallelism let n_vectors_per_chunk = embedder.prompt_count_in_chunk_hint(); // number of vectors in a single chunk @@ -723,7 +1013,7 @@ pub fn extract_embeddings( if chunks.len() == chunks.capacity() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::replace(&mut chunks, Vec::with_capacity(n_chunks)), embedder_name, possible_embedding_mistakes, @@ -746,7 +1036,7 @@ pub fn extract_embeddings( // send last chunk if !chunks.is_empty() { let chunked_embeds = embed_chunks( - &embedder, + embedder, std::mem::take(&mut chunks), embedder_name, possible_embedding_mistakes, @@ -765,7 +1055,7 @@ pub fn extract_embeddings( if !current_chunk.is_empty() { let embeds = embed_chunks( - &embedder, + embedder, vec![std::mem::take(&mut current_chunk)], embedder_name, possible_embedding_mistakes, @@ -838,3 +1128,175 @@ fn embed_chunks( } } } + +#[allow(clippy::too_many_arguments)] +#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract")] +pub fn extract_embeddings_from_fragments( + // (docid, extractor_id) -> (Option) + inputs_reader: grenad::Reader, + indexer: GrenadParameters, + runtime: Arc, + embedder_name: &str, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, + embedder_stats: &EmbedderStats, + unused_vectors_distribution: &UnusedVectorsDistribution, + request_threads: &ThreadPoolNoAbort, +) -> Result>> { + let doc_alloc = Bump::new(); + + // (docid, extractor_id) -> (Option) + let vector_writer = create_writer( + indexer.chunk_compression_type, + indexer.chunk_compression_level, + tempfile::tempfile()?, + ); + + if inputs_reader.is_empty() { + return writer_into_reader(vector_writer); + } + + let on_embed = WriteGrenadOnEmbed { + waiting_responses: Default::default(), + vector_writer, + scratch: Default::default(), + possible_embedding_mistakes, + }; + + let mut session = EmbedSession::new( + 
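// `EmbedSession` batches embedding requests and calls back into an `OnEmbed`
// implementation as responses arrive; removals bypass the session entirely via
// `push_response`. The driving pattern of this function, condensed (all names
// from this diff):
//
//     session.request_embedding(
//         Metadata { docid, external_docid: "", extractor_id },
//         value,
//         unused_vectors_distribution,
//     )?;
//     // ...once the cursor is exhausted, flush and recover the sink:
//     let on_embed = session.drain(unused_vectors_distribution)?;
//     on_embed.finish()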
&runtime.embedder, + embedder_name, + request_threads, + &doc_alloc, + embedder_stats, + on_embed, + ); + + let mut cursor = inputs_reader.into_cursor()?; + + while let Some((mut key, value)) = cursor.move_on_next()? { + let docid = key.read_u32::().unwrap(); + let extractor_id = key.read_u8().unwrap(); + + if value.is_empty() { + // no value => removed fragment + session.on_embed_mut().push_response(docid, extractor_id); + } else { + // unwrap: the grenad value was saved as a serde_json::Value + let value: Value = serde_json::from_slice(value).unwrap(); + session.request_embedding( + Metadata { docid, external_docid: "", extractor_id }, + value, + unused_vectors_distribution, + )?; + } + } + + // send last chunk + let on_embed = session.drain(unused_vectors_distribution)?; + on_embed.finish() +} + +struct WriteGrenadOnEmbed<'a> { + // list of (document_id, extractor_id) for which vectors should be removed. + // these are written whenever a response arrives that has a larger (docid, extractor_id). + waiting_responses: VecDeque<(DocumentId, u8)>, + + // grenad of (docid, extractor_id) -> (Option) + vector_writer: Writer>, + + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + + // scratch buffer used to write keys + scratch: Vec, +} + +impl WriteGrenadOnEmbed<'_> { + pub fn push_response(&mut self, docid: DocumentId, extractor_id: u8) { + self.waiting_responses.push_back((docid, extractor_id)); + } + + pub fn finish(mut self) -> Result>> { + for (docid, extractor_id) in self.waiting_responses { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, []).unwrap(); + } + writer_into_reader(self.vector_writer) + } +} + +impl<'doc> OnEmbed<'doc> for WriteGrenadOnEmbed<'_> { + type ErrorMetadata = UnusedVectorsDistribution; + fn process_embedding_response( + &mut self, + response: crate::vector::session::EmbeddingResponse<'doc>, + ) { + let (docid, extractor_id) = (response.metadata.docid, response.metadata.extractor_id); + while let Some(waiting_response) = self.waiting_responses.pop_front() { + if (docid, extractor_id) > waiting_response { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, []).unwrap(); + } else { + self.waiting_responses.push_front(waiting_response); + break; + } + } + + if let Some(embedding) = response.embedding { + self.scratch.clear(); + self.scratch.write_u32::(docid).unwrap(); + self.scratch.write_u8(extractor_id).unwrap(); + self.vector_writer.insert(&self.scratch, cast_slice(embedding.as_slice())).unwrap(); + } + } + + fn process_embedding_error( + &mut self, + error: crate::vector::error::EmbedError, + embedder_name: &'doc str, + unused_vectors_distribution: &crate::vector::error::UnusedVectorsDistribution, + _metadata: bumpalo::collections::Vec<'doc, crate::vector::session::Metadata<'doc>>, + ) -> crate::Error { + if let FaultSource::Bug = error.fault { + crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) + } else { + let mut msg = + format!(r"While embedding documents for embedder `{embedder_name}`: {error}"); + + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + } + + let mut hint_count = 0; + + for (vector_misspelling, count) in + 
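// Sketch of the invariant behind the `waiting_responses` queue used above:
// grenad writers need keys in ascending order, so removal tombstones are
// queued and only flushed once an embedding response with a strictly larger
// (docid, extractor_id) proves that no smaller key can still arrive.
// Illustrative function, not the real writer API.
use std::collections::VecDeque;

fn flush_removals_before(
    pending: &mut VecDeque<(u32, u8)>, // queued (docid, extractor_id) removals
    arrived: (u32, u8),                // key of the response that just arrived
    mut write_tombstone: impl FnMut((u32, u8)),
) {
    while let Some(&front) = pending.front() {
        if front < arrived {
            pending.pop_front();
            write_tombstone(front); // this removal sorts before the new embedding
        } else {
            break;
        }
    }
}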
self.possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in self + .possible_embedding_mistakes + .embedder_mistakes(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + + if hint_count == 0 { + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + } + + crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) + } + } +} diff --git a/crates/milli/src/update/index_documents/extract/mod.rs b/crates/milli/src/update/index_documents/extract/mod.rs index d640bc075..b41fd59e1 100644 --- a/crates/milli/src/update/index_documents/extract/mod.rs +++ b/crates/milli/src/update/index_documents/extract/mod.rs @@ -23,16 +23,17 @@ use self::extract_fid_docid_facet_values::{extract_fid_docid_facet_values, Extra use self::extract_fid_word_count_docids::extract_fid_word_count_docids; use self::extract_geo_points::extract_geo_points; use self::extract_vector_points::{ - extract_embeddings, extract_vector_points, ExtractedVectorPoints, + extract_embeddings_from_prompts, extract_vector_points, ExtractedVectorPoints, }; use self::extract_word_docids::extract_word_docids; use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids; use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{as_cloneable_grenad, CursorClonableMmap, GrenadParameters}; use super::{helpers, TypedChunk}; -use crate::index::IndexEmbeddingConfig; use crate::progress::EmbedderStats; +use crate::update::index_documents::extract::extract_vector_points::extract_embeddings_from_fragments; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::db::EmbedderInfo; use crate::vector::error::PossibleEmbeddingMistakes; use crate::{FieldId, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; @@ -46,9 +47,9 @@ pub(crate) fn data_from_obkv_documents( indexer: GrenadParameters, lmdb_writer_sx: Sender>, primary_key_id: FieldId, - embedders_configs: Arc>, settings_diff: Arc, max_positions_per_attributes: Option, + embedder_info: Arc>, possible_embedding_mistakes: Arc, embedder_stats: &Arc, ) -> Result<()> { @@ -61,8 +62,8 @@ pub(crate) fn data_from_obkv_documents( original_documents_chunk, indexer, lmdb_writer_sx.clone(), - embedders_configs.clone(), settings_diff.clone(), + embedder_info.clone(), possible_embedding_mistakes.clone(), embedder_stats.clone(), ) @@ -231,8 +232,8 @@ fn send_original_documents_data( original_documents_chunk: Result>>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, - embedders_configs: Arc>, settings_diff: Arc, + embedder_info: Arc>, possible_embedding_mistakes: Arc, embedder_stats: Arc, ) -> Result<()> { @@ -241,11 +242,10 @@ fn send_original_documents_data( let index_vectors = (settings_diff.reindex_vectors() || !settings_diff.settings_update_only()) // no point in indexing vectors without embedders - && (!settings_diff.new.embedding_configs.inner_as_ref().is_empty()); + && (!settings_diff.new.runtime_embedders.inner_as_ref().is_empty()); if index_vectors { let settings_diff = settings_diff.clone(); - let embedders_configs = embedders_configs.clone(); let original_documents_chunk = 
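// The composite keys written by the fragment extractor above are read back
// with byteorder's ReadBytesExt: 4 big-endian docid bytes followed by one
// extractor-id byte. A self-contained sketch of that decode:
use byteorder::{BigEndian, ReadBytesExt};

fn decode_fragment_key(mut key: &[u8]) -> std::io::Result<(u32, u8)> {
    let docid = key.read_u32::<BigEndian>()?; // consumes the first 4 bytes
    let extractor_id = key.read_u8()?;        // then the extractor id
    Ok((docid, extractor_id))
}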
original_documents_chunk.clone(); let lmdb_writer_sx = lmdb_writer_sx.clone(); @@ -253,8 +253,8 @@ fn send_original_documents_data( match extract_vector_points( original_documents_chunk.clone(), indexer, - &embedders_configs, &settings_diff, + embedder_info.as_slice(), &possible_embedding_mistakes, ) { Ok((extracted_vectors, unused_vectors_distribution)) => { @@ -262,16 +262,16 @@ fn send_original_documents_data( manual_vectors, remove_vectors, prompts, + inputs, embedder_name, - embedder, - add_to_user_provided, - remove_from_user_provided, + runtime, + embedding_status_delta, } in extracted_vectors { - let embeddings = match extract_embeddings( + let embeddings_from_prompts = match extract_embeddings_from_prompts( prompts, indexer, - embedder.clone(), + runtime.clone(), &embedder_name, &possible_embedding_mistakes, &embedder_stats, @@ -284,18 +284,37 @@ fn send_original_documents_data( None } }; + + let embeddings_from_fragments = match extract_embeddings_from_fragments( + inputs, + indexer, + runtime.clone(), + &embedder_name, + &possible_embedding_mistakes, + &embedder_stats, + &unused_vectors_distribution, + request_threads(), + ) { + Ok(results) => Some(results), + Err(error) => { + let _ = lmdb_writer_sx.send(Err(error)); + None + } + }; + if !(remove_vectors.is_empty() && manual_vectors.is_empty() - && embeddings.as_ref().is_none_or(|e| e.is_empty())) + && embeddings_from_prompts.as_ref().is_none_or(|e| e.is_empty()) + && embeddings_from_fragments.as_ref().is_none_or(|e| e.is_empty())) { let _ = lmdb_writer_sx.send(Ok(TypedChunk::VectorPoints { remove_vectors, - embeddings, - expected_dimension: embedder.dimensions(), + embeddings_from_prompts, + embeddings_from_fragments, + expected_dimension: runtime.embedder.dimensions(), manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, })); } } diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 5ec6910f7..658ff1923 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -38,7 +38,8 @@ pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst, }; -use crate::vector::{ArroyWrapper, EmbeddingConfigs}; +use crate::vector::db::EmbedderInfo; +use crate::vector::{ArroyWrapper, RuntimeEmbedders}; use crate::{CboRoaringBitmapCodec, Index, Result, UserError}; static MERGED_DATABASE_COUNT: usize = 7; @@ -81,7 +82,7 @@ pub struct IndexDocuments<'t, 'i, 'a, FP, FA> { should_abort: FA, added_documents: u64, deleted_documents: u64, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, embedder_stats: &'t Arc, } @@ -172,7 +173,7 @@ where Ok((self, Ok(indexed_documents))) } - pub fn with_embedders(mut self, embedders: EmbeddingConfigs) -> Self { + pub fn with_embedders(mut self, embedders: RuntimeEmbedders) -> Self { self.embedders = embedders; self } @@ -226,7 +227,13 @@ where settings_diff.new.recompute_searchables(self.wtxn, self.index)?; let settings_diff = Arc::new(settings_diff); - let embedders_configs = Arc::new(self.index.embedding_configs(self.wtxn)?); + let embedder_infos: heed::Result> = self + .index + .embedding_configs() + .iter_embedder_info(self.wtxn)? 
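// Restating the send gate above as a small predicate: a VectorPoints chunk
// is emitted only when at least one payload is non-empty, and both the
// prompt-based and fragment-based readers travel in the same chunk so the
// writer can apply them under a single embedder id. Parameters are
// illustrative stand-ins for the readers' emptiness checks.
fn should_send_vector_points(
    remove_is_empty: bool,
    manual_is_empty: bool,
    prompt_embeddings_empty: Option<bool>,   // None when extraction errored out
    fragment_embeddings_empty: Option<bool>, // likewise
) -> bool {
    !(remove_is_empty
        && manual_is_empty
        && prompt_embeddings_empty.is_none_or(|empty| empty)
        && fragment_embeddings_empty.is_none_or(|empty| empty))
}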
+ .map(|res| res.map(|(name, info)| (name.to_owned(), info))) + .collect(); + let embedder_infos = Arc::new(embedder_infos?); let possible_embedding_mistakes = crate::vector::error::PossibleEmbeddingMistakes::new(&field_distribution); @@ -328,9 +335,9 @@ where pool_params, lmdb_writer_sx.clone(), primary_key_id, - embedders_configs.clone(), settings_diff_cloned, max_positions_per_attributes, + embedder_infos, Arc::new(possible_embedding_mistakes), &embedder_stats ) @@ -430,21 +437,21 @@ where TypedChunk::VectorPoints { expected_dimension, remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } => { dimension.insert(embedder_name.clone(), expected_dimension); TypedChunk::VectorPoints { remove_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, manual_vectors, embedder_name, - add_to_user_provided, - remove_from_user_provided, + embedding_status_delta, } } otherwise => otherwise, @@ -480,7 +487,7 @@ where // we should insert it in `dimension` for (name, action) in settings_diff.embedding_config_updates.iter() { if action.is_being_quantized && !dimension.contains_key(name.as_str()) { - let index = self.index.embedder_category_id.get(self.wtxn, name)?.ok_or( + let index = self.index.embedding_configs().embedder_id(self.wtxn, name)?.ok_or( InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, @@ -488,7 +495,9 @@ where )?; let reader = ArroyWrapper::new(self.index.vector_arroy, index, action.was_quantized); - let dim = reader.dimensions(self.wtxn)?; + let Some(dim) = reader.dimensions(self.wtxn)? else { + continue; + }; dimension.insert(name.to_string(), dim); } } @@ -498,12 +507,19 @@ where let vector_arroy = self.index.vector_arroy; let cancel = &self.should_abort; - let embedder_index = self.index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; + let embedder_index = + self.index.embedding_configs().embedder_id(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?; let embedder_config = settings_diff.embedding_config_updates.get(&embedder_name); - let was_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let was_quantized = settings_diff + .old + .runtime_embedders + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); let is_quantizing = embedder_config.is_some_and(|action| action.is_being_quantized); pool.install(|| { @@ -773,11 +789,11 @@ mod tests { use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::documents::mmap_from_objects; use crate::index::tests::TempIndex; - use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::search::TermsMatchingStrategy; use crate::update::new::indexer; use crate::update::Setting; + use crate::vector::db::IndexEmbeddingConfig; use crate::{all_obkv_to_json, db_snap, Filter, FilterableAttributesRule, Search, UserError}; #[test] @@ -2028,7 +2044,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, &Progress::default(), &Default::default(), @@ -2116,7 +2132,7 @@ mod tests { new_fields_ids_map, primary_key, &document_changes, - EmbeddingConfigs::default(), + RuntimeEmbedders::default(), &|| false, 
&Progress::default(), &Default::default(), @@ -2277,7 +2293,7 @@ mod tests { ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap(); indexer.delete_documents(&["2"]); @@ -2343,7 +2359,7 @@ mod tests { indexer.delete_documents(&["1", "2"]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let (document_changes, _operation_stats, primary_key) = indexer .into_changes( &indexer_alloc, @@ -2394,7 +2410,7 @@ mod tests { { "id": 3, "name": "jean", "age": 25 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); @@ -2446,7 +2462,7 @@ mod tests { { "id": 3, "legs": 4 }, ]); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.update_documents(&documents).unwrap(); indexer.delete_documents(&["1", "2"]); @@ -2496,7 +2512,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2"]); @@ -2552,7 +2568,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1", "2", "1", "2"]); @@ -2611,7 +2627,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ @@ -2661,7 +2677,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2775,6 +2791,8 @@ mod tests { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, distribution: Setting::NotSet, @@ -2801,17 +2819,27 @@ mod tests { .unwrap(); let rtxn = index.read_txn().unwrap(); - let mut embedding_configs = index.embedding_configs(&rtxn).unwrap(); - let IndexEmbeddingConfig { name: embedder_name, config: embedder, user_provided } = + let embedders = index.embedding_configs(); + let mut embedding_configs = embedders.embedding_configs(&rtxn).unwrap(); + let IndexEmbeddingConfig { name: embedder_name, config: embedder, fragments } = embedding_configs.pop().unwrap(); + let info = embedders.embedder_info(&rtxn, &embedder_name).unwrap().unwrap(); + insta::assert_snapshot!(info.embedder_id, @"0"); + insta::assert_debug_snapshot!(info.embedding_status.user_provided_docids(), @"RoaringBitmap<[0, 1, 2]>"); + 
insta::assert_debug_snapshot!(info.embedding_status.skip_regenerate_docids(), @"RoaringBitmap<[0, 1, 2]>"); insta::assert_snapshot!(embedder_name, @"manual"); - insta::assert_debug_snapshot!(user_provided, @"RoaringBitmap<[0, 1, 2]>"); + insta::assert_debug_snapshot!(fragments, @r###" + FragmentConfigs( + [], + ) + "###); + let embedder = std::sync::Arc::new( crate::vector::Embedder::new(embedder.embedder_options, 0).unwrap(), ); let res = index .search(&rtxn) - .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec())) + .semantic(embedder_name, embedder, false, Some([0.0, 1.0, 2.0].to_vec()), None) .execute() .unwrap(); assert_eq!(res.documents_ids.len(), 3); @@ -2860,7 +2888,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); // OP @@ -2921,7 +2949,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.delete_documents(&["1"]); @@ -2980,7 +3008,7 @@ mod tests { let mut new_fields_ids_map = db_fields_ids_map.clone(); let indexer_alloc = Bump::new(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let documents = documents!([ diff --git a/crates/milli/src/update/index_documents/transform.rs b/crates/milli/src/update/index_documents/transform.rs index e17625ad4..e07483aff 100644 --- a/crates/milli/src/update/index_documents/transform.rs +++ b/crates/milli/src/update/index_documents/transform.rs @@ -31,7 +31,7 @@ use crate::update::index_documents::GrenadParameters; use crate::update::settings::{InnerIndexSettings, InnerIndexSettingsDiff}; use crate::update::{AvailableIds, UpdateIndexingStep}; use crate::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors}; -use crate::vector::settings::WriteBackToDocuments; +use crate::vector::settings::{RemoveFragments, WriteBackToDocuments}; use crate::vector::ArroyWrapper; use crate::{FieldDistribution, FieldId, FieldIdMapMissingEntry, Index, Result}; @@ -933,10 +933,47 @@ impl<'a, 'i> Transform<'a, 'i> { // delete all vectors from the embedders that need removal for (_, (reader, _)) in readers { - let dimensions = reader.dimensions(wtxn)?; + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; reader.clear(wtxn, dimensions)?; } + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_diff.embedding_config_updates.iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = self.index.embedding_configs().embedder_info(wtxn, embedder_name)? + else { + continue; + }; + let arroy = + ArroyWrapper::new(self.index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? 
else { + continue; + }; + for fragment_id in fragment_ids { + // we must keep the user provided embeddings that ended up in this store + + if infos.embedding_status.user_provided_docids().is_empty() { + // no user provided: clear store + arroy.clear_store(wtxn, *fragment_id, dimensions)?; + continue; + } + + // some user provided, remove only the ids that are not user provided + let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + items - infos.embedding_status.user_provided_docids() + })?; + + for to_delete in to_delete { + arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + } + } + } + let grenad_params = GrenadParameters { chunk_compression_type: self.indexer_settings.chunk_compression_type, chunk_compression_level: self.indexer_settings.chunk_compression_level, diff --git a/crates/milli/src/update/index_documents/typed_chunk.rs b/crates/milli/src/update/index_documents/typed_chunk.rs index 6d575a98b..c93e3e0f7 100644 --- a/crates/milli/src/update/index_documents/typed_chunk.rs +++ b/crates/milli/src/update/index_documents/typed_chunk.rs @@ -4,6 +4,7 @@ use std::fs::File; use std::io::{self, BufReader}; use bytemuck::allocation::pod_collect_to_vec; +use byteorder::{BigEndian, ReadBytesExt as _}; use grenad::{MergeFunction, Merger, MergerBuilder}; use heed::types::Bytes; use heed::{BytesDecode, RwTxn}; @@ -18,7 +19,6 @@ use super::helpers::{ use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind}; use crate::facet::FacetType; use crate::index::db_name::DOCUMENTS; -use crate::index::IndexEmbeddingConfig; use crate::proximity::MAX_DISTANCE; use crate::update::del_add::{deladd_serialize_add_side, DelAdd, KvReaderDelAdd}; use crate::update::facet::FacetsUpdate; @@ -26,6 +26,7 @@ use crate::update::index_documents::helpers::{ as_cloneable_grenad, try_split_array_at, KeepLatestObkv, }; use crate::update::settings::InnerIndexSettingsDiff; +use crate::vector::db::{EmbeddingStatusDelta, IndexEmbeddingConfig}; use crate::vector::ArroyWrapper; use crate::{ lat_lng_to_xyz, CboRoaringBitmapCodec, DocumentId, FieldId, GeoPoint, Index, InternalError, @@ -86,12 +87,14 @@ pub(crate) enum TypedChunk { GeoPoints(grenad::Reader>), VectorPoints { remove_vectors: grenad::Reader>, - embeddings: Option>>, + // docid -> vector + embeddings_from_prompts: Option>>, + // docid, extractor_id -> Option, + embeddings_from_fragments: Option>>, expected_dimension: usize, manual_vectors: grenad::Reader>, embedder_name: String, - add_to_user_provided: RoaringBitmap, - remove_from_user_provided: RoaringBitmap, + embedding_status_delta: EmbeddingStatusDelta, }, } @@ -155,6 +158,7 @@ pub(crate) fn write_typed_chunk_into_index( let mut iter = merger.into_stream_merger_iter()?; let embedders: BTreeSet<_> = index + .embedding_configs() .embedding_configs(wtxn)? .into_iter() .map(|IndexEmbeddingConfig { name, .. 
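// Sketch of the fragment-removal rule above: when no document in the store
// has user-provided vectors, the whole per-fragment store is cleared;
// otherwise only the auto-generated items are deleted, using roaring's set
// difference exactly as the hunk does.
use roaring::RoaringBitmap;

fn autogenerated_to_delete(
    items_in_store: &RoaringBitmap,
    user_provided: &RoaringBitmap,
) -> RoaringBitmap {
    if user_provided.is_empty() {
        items_in_store.clone() // no user vectors: everything can go
    } else {
        items_in_store - user_provided // keep what the user supplied
    }
}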
}| name) @@ -614,57 +618,66 @@ pub(crate) fn write_typed_chunk_into_index( let span = tracing::trace_span!(target: "indexing::write_db", "vector_points"); let _entered = span.enter(); + let embedders = index.embedding_configs(); + let mut remove_vectors_builder = MergerBuilder::new(KeepFirst); let mut manual_vectors_builder = MergerBuilder::new(KeepFirst); - let mut embeddings_builder = MergerBuilder::new(KeepFirst); - let mut add_to_user_provided = RoaringBitmap::new(); - let mut remove_from_user_provided = RoaringBitmap::new(); + let mut embeddings_from_prompts_builder = MergerBuilder::new(KeepFirst); + let mut embeddings_from_fragments_builder = MergerBuilder::new(KeepFirst); let mut params = None; + let mut infos = None; for typed_chunk in typed_chunks { let TypedChunk::VectorPoints { remove_vectors, manual_vectors, - embeddings, + embeddings_from_prompts, + embeddings_from_fragments, expected_dimension, embedder_name, - add_to_user_provided: aud, - remove_from_user_provided: rud, + embedding_status_delta, } = typed_chunk else { unreachable!(); }; + if infos.is_none() { + infos = Some(embedders.embedder_info(wtxn, &embedder_name)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + }, + )?); + } + params = Some((expected_dimension, embedder_name)); remove_vectors_builder.push(remove_vectors.into_cursor()?); manual_vectors_builder.push(manual_vectors.into_cursor()?); - if let Some(embeddings) = embeddings { - embeddings_builder.push(embeddings.into_cursor()?); + if let Some(embeddings) = embeddings_from_prompts { + embeddings_from_prompts_builder.push(embeddings.into_cursor()?); + } + if let Some(embeddings) = embeddings_from_fragments { + embeddings_from_fragments_builder.push(embeddings.into_cursor()?); + } + + if let Some(infos) = &mut infos { + embedding_status_delta.apply_to(&mut infos.embedding_status); } - add_to_user_provided |= aud; - remove_from_user_provided |= rud; } // typed chunks has always at least 1 chunk. let Some((expected_dimension, embedder_name)) = params else { unreachable!() }; + let Some(infos) = infos else { unreachable!() }; - let mut embedding_configs = index.embedding_configs(wtxn)?; - let index_embedder_config = embedding_configs - .iter_mut() - .find(|IndexEmbeddingConfig { name, .. 
}| name == &embedder_name) - .unwrap(); - index_embedder_config.user_provided -= remove_from_user_provided; - index_embedder_config.user_provided |= add_to_user_provided; + embedders.put_embedder_info(wtxn, &embedder_name, &infos)?; - index.put_embedding_configs(wtxn, embedding_configs)?; - - let embedder_index = index.embedder_category_id.get(wtxn, &embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None }, - )?; - let binary_quantized = - settings_diff.old.embedding_configs.get(&embedder_name).is_some_and(|conf| conf.2); + let binary_quantized = settings_diff + .old + .runtime_embedders + .get(&embedder_name) + .is_some_and(|conf| conf.is_quantized); // FIXME: allow customizing distance - let writer = ArroyWrapper::new(index.vector_arroy, embedder_index, binary_quantized); + let writer = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, binary_quantized); // remove vectors for docids we want them removed let merger = remove_vectors_builder.build(); @@ -674,8 +687,8 @@ pub(crate) fn write_typed_chunk_into_index( writer.del_items(wtxn, expected_dimension, docid)?; } - // add generated embeddings - let merger = embeddings_builder.build(); + // add generated embeddings -- from prompts + let merger = embeddings_from_prompts_builder.build(); let mut iter = merger.into_stream_merger_iter()?; while let Some((key, value)) = iter.next()? { let docid = key.try_into().map(DocumentId::from_be_bytes).unwrap(); @@ -702,6 +715,24 @@ pub(crate) fn write_typed_chunk_into_index( writer.add_items(wtxn, docid, &embeddings)?; } + // add generated embeddings -- from fragments + let merger = embeddings_from_fragments_builder.build(); + let mut iter = merger.into_stream_merger_iter()?; + while let Some((mut key, value)) = iter.next()? { + let docid = key.read_u32::().unwrap(); + let extractor_id = key.read_u8().unwrap(); + if value.is_empty() { + writer.del_item_in_store(wtxn, docid, extractor_id, expected_dimension)?; + } else { + let data = pod_collect_to_vec(value); + // it is a code error to have embeddings and not expected_dimension + if data.len() != expected_dimension { + panic!("wrong dimensions") + } + writer.add_item_in_store(wtxn, docid, extractor_id, &data)?; + } + } + // perform the manual diff let merger = manual_vectors_builder.build(); let mut iter = merger.into_stream_merger_iter()?; diff --git a/crates/milli/src/update/new/channel.rs b/crates/milli/src/update/new/channel.rs index 4fff31a35..aec192ace 100644 --- a/crates/milli/src/update/new/channel.rs +++ b/crates/milli/src/update/new/channel.rs @@ -138,6 +138,7 @@ pub enum ReceiverAction { WakeUp, LargeEntry(LargeEntry), LargeVectors(LargeVectors), + LargeVector(LargeVector), } /// An entry that cannot fit in the BBQueue buffers has been @@ -174,6 +175,24 @@ impl LargeVectors { } } +#[derive(Debug)] +pub struct LargeVector { + /// The document id associated to the large embedding. + pub docid: DocumentId, + /// The embedder id in which to insert the large embedding. + pub embedder_id: u8, + /// The extractor id in which to insert the large embedding. + pub extractor_id: u8, + /// The large embedding that must be written. + pub embedding: Mmap, +} + +impl LargeVector { + pub fn read_embedding(&self, dimensions: usize) -> &[f32] { + self.embedding.chunks_exact(dimensions).map(bytemuck::cast_slice).next().unwrap() + } +} + impl<'a> WriterBbqueueReceiver<'a> { /// Tries to receive an action to do until the timeout occurs /// and if it does, consider it as a spurious wake up. 
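// Sketch of the write-back dispatch for fragment embeddings shown above: an
// empty grenad value deletes (docid, extractor_id) from the store, and any
// other value must decode to exactly `expected_dimension` f32s.
// bytemuck's pod_collect_to_vec performs the unaligned byte-to-f32 copy,
// as in the hunk.
use bytemuck::allocation::pod_collect_to_vec;

fn decode_fragment_embedding(value: &[u8], expected_dimension: usize) -> Option<Vec<f32>> {
    if value.is_empty() {
        return None; // tombstone: caller should del_item_in_store
    }
    let data: Vec<f32> = pod_collect_to_vec(value);
    // it is a code error to have an embedding of the wrong dimension
    assert_eq!(data.len(), expected_dimension, "wrong dimensions");
    Some(data)
}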
@@ -238,6 +257,7 @@ pub enum EntryHeader { DbOperation(DbOperation), ArroyDeleteVector(ArroyDeleteVector), ArroySetVectors(ArroySetVectors), + ArroySetVector(ArroySetVector), } impl EntryHeader { @@ -250,6 +270,7 @@ impl EntryHeader { EntryHeader::DbOperation(_) => 0, EntryHeader::ArroyDeleteVector(_) => 1, EntryHeader::ArroySetVectors(_) => 2, + EntryHeader::ArroySetVector(_) => 3, } } @@ -274,11 +295,17 @@ impl EntryHeader { Self::variant_size() + mem::size_of::() + embedding_size * count } + fn total_set_vector_size(dimensions: usize) -> usize { + let embedding_size = dimensions * mem::size_of::(); + Self::variant_size() + mem::size_of::() + embedding_size + } + fn header_size(&self) -> usize { let payload_size = match self { EntryHeader::DbOperation(op) => mem::size_of_val(op), EntryHeader::ArroyDeleteVector(adv) => mem::size_of_val(adv), EntryHeader::ArroySetVectors(asvs) => mem::size_of_val(asvs), + EntryHeader::ArroySetVector(asv) => mem::size_of_val(asv), }; Self::variant_size() + payload_size } @@ -301,6 +328,11 @@ impl EntryHeader { let header = checked::pod_read_unaligned(header_bytes); EntryHeader::ArroySetVectors(header) } + 3 => { + let header_bytes = &remaining[..mem::size_of::()]; + let header = checked::pod_read_unaligned(header_bytes); + EntryHeader::ArroySetVector(header) + } id => panic!("invalid variant id: {id}"), } } @@ -311,6 +343,7 @@ impl EntryHeader { EntryHeader::DbOperation(op) => bytemuck::bytes_of(op), EntryHeader::ArroyDeleteVector(adv) => bytemuck::bytes_of(adv), EntryHeader::ArroySetVectors(asvs) => bytemuck::bytes_of(asvs), + EntryHeader::ArroySetVector(asv) => bytemuck::bytes_of(asv), }; *first = self.variant_id(); remaining.copy_from_slice(payload_bytes); @@ -379,6 +412,37 @@ impl ArroySetVectors { } } +#[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] +#[repr(C)] +/// The embeddings are in the remaining space and represents +/// non-aligned [f32] each with dimensions f32s. +pub struct ArroySetVector { + pub docid: DocumentId, + pub embedder_id: u8, + pub extractor_id: u8, + _padding: [u8; 2], +} + +impl ArroySetVector { + fn embeddings_bytes<'a>(frame: &'a FrameGrantR<'_>) -> &'a [u8] { + let skip = EntryHeader::variant_size() + mem::size_of::(); + &frame[skip..] + } + + /// Read the embedding and write it into an aligned `f32` Vec. + pub fn read_all_embeddings_into_vec<'v>( + &self, + frame: &FrameGrantR<'_>, + vec: &'v mut Vec, + ) -> &'v [f32] { + let embeddings_bytes = Self::embeddings_bytes(frame); + let embeddings_count = embeddings_bytes.len() / mem::size_of::(); + vec.resize(embeddings_count, 0.0); + bytemuck::cast_slice_mut(vec.as_mut()).copy_from_slice(embeddings_bytes); + &vec[..] 
+ } +} + #[derive(Debug, Clone, Copy, NoUninit, CheckedBitPattern)] #[repr(u16)] pub enum Database { @@ -398,6 +462,7 @@ pub enum Database { FacetIdStringDocids, FieldIdDocidFacetStrings, FieldIdDocidFacetF64s, + VectorEmbedderCategoryId, } impl Database { @@ -419,6 +484,7 @@ impl Database { Database::FacetIdStringDocids => index.facet_id_string_docids.remap_types(), Database::FieldIdDocidFacetStrings => index.field_id_docid_facet_strings.remap_types(), Database::FieldIdDocidFacetF64s => index.field_id_docid_facet_f64s.remap_types(), + Database::VectorEmbedderCategoryId => index.embedder_category_id.remap_types(), } } @@ -440,6 +506,7 @@ impl Database { Database::FacetIdStringDocids => db_name::FACET_ID_STRING_DOCIDS, Database::FieldIdDocidFacetStrings => db_name::FIELD_ID_DOCID_FACET_STRINGS, Database::FieldIdDocidFacetF64s => db_name::FIELD_ID_DOCID_FACET_F64S, + Database::VectorEmbedderCategoryId => db_name::VECTOR_EMBEDDER_CATEGORY_ID, } } } @@ -568,6 +635,82 @@ impl<'b> ExtractorBbqueueSender<'b> { Ok(()) } + fn set_vector_for_extractor( + &self, + docid: u32, + embedder_id: u8, + extractor_id: u8, + embedding: Option, + ) -> crate::Result<()> { + let max_grant = self.max_grant; + let refcell = self.producers.get().unwrap(); + let mut producer = refcell.0.borrow_mut_or_yield(); + + // If there are no vectors we specify the dimensions + // to zero to allocate no extra space at all + let dimensions = embedding.as_ref().map_or(0, |emb| emb.len()); + + let arroy_set_vector = + ArroySetVector { docid, embedder_id, extractor_id, _padding: [0; 2] }; + let payload_header = EntryHeader::ArroySetVector(arroy_set_vector); + let total_length = EntryHeader::total_set_vector_size(dimensions); + if total_length > max_grant { + let mut value_file = tempfile::tempfile().map(BufWriter::new)?; + let embedding = embedding.expect("set_vector without a vector does not fit in RAM"); + + let mut embedding_bytes = bytemuck::cast_slice(&embedding); + io::copy(&mut embedding_bytes, &mut value_file)?; + + let value_file = value_file.into_inner().map_err(|ie| ie.into_error())?; + let embedding = unsafe { Mmap::map(&value_file)? }; + + let large_vectors = LargeVector { docid, embedder_id, extractor_id, embedding }; + self.sender.send(ReceiverAction::LargeVector(large_vectors)).unwrap(); + + return Ok(()); + } + + // Spin loop to have a frame the size we requested. 
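// A hedged mirror of read_all_embeddings_into_vec from the impl closed
// above: the payload behind the one-byte variant id and the POD header is
// reinterpreted as f32s by resizing an aligned Vec and copying the raw
// bytes into it. Parameter names are illustrative.
use std::mem;

fn payload_into_f32s(frame: &[u8], header_size: usize, out: &mut Vec<f32>) -> usize {
    let bytes = &frame[header_size..]; // skip variant id + POD header
    let count = bytes.len() / mem::size_of::<f32>();
    out.resize(count, 0.0);
    bytemuck::cast_slice_mut::<f32, u8>(out.as_mut_slice())
        .copy_from_slice(&bytes[..count * mem::size_of::<f32>()]);
    count
}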
+ reserve_and_write_grant( + &mut producer, + total_length, + &self.sender, + &self.sent_messages_attempts, + &self.blocking_sent_messages_attempts, + |grant| { + let header_size = payload_header.header_size(); + let (header_bytes, remaining) = grant.split_at_mut(header_size); + payload_header.serialize_into(header_bytes); + + if dimensions != 0 { + let output_iter = + remaining.chunks_exact_mut(dimensions * mem::size_of::()); + + for (embedding, output) in embedding.iter().zip(output_iter) { + output.copy_from_slice(bytemuck::cast_slice(embedding)); + } + } + + Ok(()) + }, + )?; + + Ok(()) + } + + fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + let bytes = infos.to_bytes().map_err(|_| { + InternalError::Serialization(crate::SerializationError::Encoding { + db_name: Some(Database::VectorEmbedderCategoryId.database_name()), + }) + })?; + self.write_key_value(Database::VectorEmbedderCategoryId, name.as_bytes(), &bytes) + } + fn write_key_value(&self, database: Database, key: &[u8], value: &[u8]) -> crate::Result<()> { let key_length = key.len().try_into().ok().and_then(NonZeroU16::new).ok_or_else(|| { InternalError::StorePut { @@ -942,9 +1085,18 @@ impl EmbeddingSender<'_, '_> { &self, docid: DocumentId, embedder_id: u8, - embedding: Embedding, + extractor_id: u8, + embedding: Option, ) -> crate::Result<()> { - self.0.set_vectors(docid, embedder_id, &[embedding]) + self.0.set_vector_for_extractor(docid, embedder_id, extractor_id, embedding) + } + + pub(crate) fn embedding_status( + &self, + name: &str, + infos: crate::vector::db::EmbedderInfo, + ) -> crate::Result<()> { + self.0.embedding_status(name, infos) } } diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index b07cc0298..d520bb952 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -12,6 +12,7 @@ use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; use crate::constants::{RESERVED_GEO_FIELD_NAME, RESERVED_VECTORS_FIELD_NAME}; use crate::documents::FieldIdMapper; +use crate::update::del_add::KvReaderDelAdd; use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal}; use crate::update::new::vector_document::VectorDocumentFromDb; use crate::vector::settings::EmbedderAction; @@ -469,6 +470,110 @@ impl<'doc> Versions<'doc> { } } +#[derive(Debug)] +pub struct KvDelAddDocument<'a, Mapper: FieldIdMapper> { + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, +} + +impl<'a, Mapper: FieldIdMapper> KvDelAddDocument<'a, Mapper> { + pub fn new( + document: &'a obkv::KvReaderU16, + side: crate::update::del_add::DelAdd, + fields_ids_map: &'a Mapper, + ) -> Self { + Self { document, side, fields_ids_map } + } + + fn get(&self, k: &str) -> Result> { + let Some(id) = self.fields_ids_map.id(k) else { return Ok(None) }; + let Some(value) = self.document.get(id) else { return Ok(None) }; + let Some(value) = KvReaderDelAdd::from_slice(value).get(self.side) else { return Ok(None) }; + + let value = serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok(Some(value)) + } +} + +impl<'a, Mapper: FieldIdMapper> Document<'a> for KvDelAddDocument<'a, Mapper> { + fn iter_top_level_fields(&self) -> impl Iterator> { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(value) = 
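// Sketch of the oversized-embedding path in set_vector_for_extractor above:
// when the frame would exceed the bbqueue grant size, the embedding is
// written to an anonymous temp file and re-surfaced as an Mmap so the
// receiver can read it without copying. Hedged: write_all stands in for the
// hunk's io::copy, and error handling is reduced to io::Result.
use std::io::{self, BufWriter, Write};
use memmap2::Mmap;

fn spill_embedding(embedding: &[f32]) -> io::Result<Mmap> {
    let mut file = BufWriter::new(tempfile::tempfile()?);
    file.write_all(bytemuck::cast_slice(embedding))?; // raw f32 bytes
    let file = file.into_inner().map_err(|e| e.into_error())?;
    // SAFETY: private anonymous temp file, fully written before mapping
    unsafe { Mmap::map(&file) }
}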
KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(error) => return Some(Err(error.into())), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + let res = (|| { + let value = + serde_json::from_slice(value).map_err(crate::InternalError::SerdeJson)?; + + Ok((name, value)) + })(); + + return Some(res); + }) + } + + fn top_level_fields_count(&self) -> usize { + let mut it = self.document.iter(); + + std::iter::from_fn(move || loop { + let (fid, value) = it.next()?; + let Some(_) = KvReaderDelAdd::from_slice(value).get(self.side) else { + continue; + }; + let name = match self.fields_ids_map.name(fid).ok_or( + InternalError::FieldIdMapMissingEntry(crate::FieldIdMapMissingEntry::FieldId { + field_id: fid, + process: "getting current document", + }), + ) { + Ok(name) => name, + Err(_) => return Some(()), + }; + + if name == RESERVED_VECTORS_FIELD_NAME || name == RESERVED_GEO_FIELD_NAME { + continue; + } + + return Some(()); + }) + .count() + } + + fn top_level_field(&self, k: &str) -> Result> { + if k == RESERVED_VECTORS_FIELD_NAME || k == RESERVED_GEO_FIELD_NAME { + return Ok(None); + } + self.get(k) + } + + fn vectors_field(&self) -> Result> { + self.get(RESERVED_VECTORS_FIELD_NAME) + } + + fn geo_field(&self) -> Result> { + self.get(RESERVED_GEO_FIELD_NAME) + } +} + pub struct DocumentIdentifiers<'doc> { docid: DocumentId, external_document_id: &'doc str, diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index 2b9161319..1a40615e7 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -11,7 +11,7 @@ use super::vector_document::{ use crate::attribute_patterns::PatternMatch; use crate::documents::FieldIdMapper; use crate::update::new::document::DocumentIdentifiers; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::{DocumentId, Index, InternalError, Result}; pub enum DocumentChange<'doc> { @@ -70,7 +70,7 @@ impl<'doc> Insertion<'doc> { pub fn inserted_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -241,7 +241,7 @@ impl<'doc> Update<'doc> { pub fn only_changed_vectors( &self, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { VectorDocumentFromVersions::new(self.external_document_id, &self.new, doc_alloc, embedders) } @@ -252,7 +252,7 @@ impl<'doc> Update<'doc> { index: &'doc Index, mapper: &'doc Mapper, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result>> { if self.from_scratch { MergedVectorDocument::without_db( diff --git a/crates/milli/src/update/new/extract/documents.rs b/crates/milli/src/update/new/extract/documents.rs index 5c1a1927a..31d2ada0f 100644 --- a/crates/milli/src/update/new/extract/documents.rs +++ b/crates/milli/src/update/new/extract/documents.rs @@ -7,8 +7,7 @@ use hashbrown::HashMap; use super::DelAddRoaringBitmap; use crate::constants::RESERVED_GEO_FIELD_NAME; use crate::update::new::channel::{DocumentsSender, 
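// Both document iterators above filter out the reserved top-level fields.
// A distilled sketch, assuming RESERVED_VECTORS_FIELD_NAME and
// RESERVED_GEO_FIELD_NAME resolve to "_vectors" and "_geo":
const RESERVED: [&str; 2] = ["_vectors", "_geo"];

fn is_user_field(name: &str) -> bool {
    !RESERVED.contains(&name) // reserved fields never surface as user fields
}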
ExtractorBbqueueSender}; -use crate::update::new::document::{write_to_obkv, Document}; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{write_to_obkv, Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::{Extractor, IndexingContext}; use crate::update::new::indexer::settings_changes::{ settings_change_extract, DocumentsIndentifiers, SettingsChangeExtractor, @@ -19,16 +18,16 @@ use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; use crate::update::settings::SettingsDelta; use crate::vector::settings::EmbedderAction; -use crate::vector::EmbeddingConfigs; +use crate::vector::RuntimeEmbedders; use crate::Result; pub struct DocumentsExtractor<'a, 'b> { document_sender: DocumentsSender<'a, 'b>, - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, } impl<'a, 'b> DocumentsExtractor<'a, 'b> { - pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a EmbeddingConfigs) -> Self { + pub fn new(document_sender: DocumentsSender<'a, 'b>, embedders: &'a RuntimeEmbedders) -> Self { Self { document_sender, embedders } } } diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 4d308018a..4ca68027c 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -1,30 +1,36 @@ use std::cell::RefCell; -use std::collections::BTreeMap; +use std::fmt::Debug; use bumpalo::collections::Vec as BVec; use bumpalo::Bump; use hashbrown::{DefaultHashBuilder, HashMap}; -use super::cache::DelAddRoaringBitmap; use crate::error::FaultSource; use crate::progress::EmbedderStats; use crate::prompt::Prompt; use crate::update::new::channel::EmbeddingSender; -use crate::update::new::document::{DocumentContext, DocumentIdentifiers}; +use crate::update::new::document::{Document, DocumentContext, DocumentIdentifiers}; use crate::update::new::indexer::document_changes::Extractor; use crate::update::new::indexer::settings_changes::SettingsChangeExtractor; use crate::update::new::thread_local::MostlySend; use crate::update::new::vector_document::VectorDocument; use crate::update::new::DocumentChange; +use crate::update::settings::SettingsDelta; +use crate::vector::db::{EmbedderInfo, EmbeddingStatus, EmbeddingStatusDelta}; use crate::vector::error::{ EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistributionBump, }; -use crate::vector::settings::{EmbedderAction, ReindexAction}; -use crate::vector::{Embedder, Embedding, EmbeddingConfigs}; +use crate::vector::extractor::{ + DocumentTemplateExtractor, Extractor as VectorExtractor, ExtractorDiff, + RequestFragmentExtractor, +}; +use crate::vector::session::{EmbedSession, Input, Metadata, OnEmbed}; +use crate::vector::settings::ReindexAction; +use crate::vector::{Embedding, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment}; use crate::{DocumentId, FieldDistribution, InternalError, Result, ThreadPoolNoAbort, UserError}; pub struct EmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, embedder_stats: &'a EmbedderStats, @@ -33,7 +39,7 @@ pub struct EmbeddingExtractor<'a, 'b> { impl<'a, 'b> EmbeddingExtractor<'a, 'b> { pub fn new( - embedders: &'a EmbeddingConfigs, + embedders: &'a RuntimeEmbedders, sender: EmbeddingSender<'a, 'b>, 
field_distribution: &'a FieldDistribution, embedder_stats: &'a EmbedderStats, @@ -45,7 +51,7 @@ impl<'a, 'b> EmbeddingExtractor<'a, 'b> { } pub struct EmbeddingExtractorData<'extractor>( - pub HashMap, + pub HashMap, ); unsafe impl MostlySend for EmbeddingExtractorData<'_> {} @@ -67,19 +73,18 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); - for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { - let embedder_id = - context.index.embedder_category_id.get(&context.rtxn, embedder_name)?.ok_or_else( - || InternalError::DatabaseMissingEntry { - db_name: "embedder_category_id", - key: None, - }, - )?; + let embedder_db = context.index.embedding_configs(); + for (embedder_name, runtime) in embedders { + let embedder_info = embedder_db + .embedder_info(&context.rtxn, embedder_name)? + .ok_or_else(|| InternalError::DatabaseMissingEntry { + db_name: "embedder_category_id", + key: None, + })?; all_chunks.push(Chunks::new( - embedder, - embedder_id, + runtime, + embedder_info, embedder_name, - prompt, context.data, &self.possible_embedding_mistakes, self.embedder_stats, @@ -94,19 +99,14 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { match change { DocumentChange::Deletion(deletion) => { // vector deletion is handled by document sender, - // we still need to accomodate deletion from user_provided + // we still need to accomodate deletion from embedding_status for chunks in &mut all_chunks { - // regenerate: true means we delete from user_provided - chunks.set_regenerate(deletion.docid(), true); + let (is_user_provided, must_regenerate) = + chunks.is_user_provided_must_regenerate(deletion.docid()); + chunks.clear_status(deletion.docid(), is_user_provided, must_regenerate); } } DocumentChange::Update(update) => { - let old_vectors = update.current_vectors( - &context.rtxn, - context.index, - context.db_fields_ids_map, - &context.doc_alloc, - )?; let new_vectors = update.only_changed_vectors(&context.doc_alloc, self.embedders)?; @@ -115,19 +115,16 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } for chunks in &mut all_chunks { - let embedder_name = chunks.embedder_name(); - let prompt = chunks.prompt(); + let (old_is_user_provided, old_must_regenerate) = + chunks.is_user_provided_must_regenerate(update.docid()); - let old_vectors = old_vectors.vectors_for_key(embedder_name)?.unwrap(); + let embedder_name = chunks.embedder_name(); // case where we have a `_vectors` field in the updated document if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - if old_vectors.regenerate != new_vectors.regenerate { - chunks.set_regenerate(update.docid(), new_vectors.regenerate); - } // do we have set embeddings? if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( @@ -139,97 +136,62 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { document_id: update.external_document_id().to_string(), error: error.to_string(), })?, + old_is_user_provided, + old_must_regenerate, + new_vectors.regenerate, )?; // regenerate if the new `_vectors` fields is set to. 
} else if new_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - let must_regenerate = if !old_vectors.regenerate { - // we just enabled `regenerate` - true - } else { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - - if let Ok(old_rendered) = old_rendered { - // must regenerate if the rendered changed - new_rendered != old_rendered - } else { - // cannot check previous rendered, better regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( - update.docid(), - update.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } - } - // no `_vectors` field, so only regenerate if the document is already set to in the DB. - } else if old_vectors.regenerate { - let new_rendered = prompt.render_document( - update.external_document_id(), - update.merged( + let new_document = update.merged( &context.rtxn, context.index, context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - let must_regenerate = { - let old_rendered = prompt.render_document( - update.external_document_id(), - update.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - ); - if let Ok(old_rendered) = old_rendered { - // regenerate if the rendered version changed - new_rendered != old_rendered - } else { - // if we cannot render the previous version of the documents, let's regenerate - true - } - }; - - if must_regenerate { - chunks.set_autogenerated( + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( update.docid(), update.external_document_id(), - new_rendered, + old_document, + new_document, + context.new_fields_ids_map, &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, )?; } + // no `_vectors` field, so only regenerate if the document is already set to in the DB. 
+ } else if old_must_regenerate { + let new_document = update.merged( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + let old_document = update.current( + &context.rtxn, + context.index, + context.db_fields_ids_map, + )?; + chunks.update_autogenerated( + update.docid(), + update.external_document_id(), + old_document, + new_document, + context.new_fields_ids_map, + &unused_vectors_distribution, + old_is_user_provided, + old_must_regenerate, + true, + )?; } } } DocumentChange::Insertion(insertion) => { + let (default_is_user_provided, default_must_regenerate) = (false, true); let new_vectors = insertion.inserted_vectors(&context.doc_alloc, self.embedders)?; if let Some(new_vectors) = &new_vectors { @@ -238,13 +200,11 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { for chunks in &mut all_chunks { let embedder_name = chunks.embedder_name(); - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(new_vectors) = new_vectors.as_ref().and_then(|new_vectors| { new_vectors.vectors_for_key(embedder_name).transpose() }) { let new_vectors = new_vectors?; - chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( insertion.external_document_id(), @@ -257,33 +217,36 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { .to_string(), error: error.to_string(), })?, + default_is_user_provided, + default_must_regenerate, + new_vectors.regenerate, )?; } else if new_vectors.regenerate { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; + } else { + chunks.set_status( + insertion.docid(), + default_is_user_provided, + default_must_regenerate, + false, + false, + ); } } else { - let rendered = prompt.render_document( + chunks.insert_autogenerated( + insertion.docid(), insertion.external_document_id(), insertion.inserted(), context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - insertion.docid(), - insertion.external_document_id(), - rendered, &unused_vectors_distribution, + true, )?; } } @@ -298,44 +261,31 @@ impl<'extractor> Extractor<'extractor> for EmbeddingExtractor<'_, '_> { } } -pub struct SettingsChangeEmbeddingExtractor<'a, 'b> { - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, +pub struct SettingsChangeEmbeddingExtractor<'a, 'b, SD> { + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, possible_embedding_mistakes: PossibleEmbeddingMistakes, threads: &'a ThreadPoolNoAbort, } -impl<'a, 'b> SettingsChangeEmbeddingExtractor<'a, 'b> { +impl<'a, 'b, SD: SettingsDelta> SettingsChangeEmbeddingExtractor<'a, 'b, SD> { #[allow(clippy::too_many_arguments)] pub fn new( - embedders: &'a EmbeddingConfigs, - old_embedders: &'a EmbeddingConfigs, - embedder_actions: &'a BTreeMap, - embedder_category_id: &'a std::collections::HashMap, + settings_delta: &'a SD, embedder_stats: &'a EmbedderStats, sender: EmbeddingSender<'a, 'b>, field_distribution: &'a FieldDistribution, threads: &'a ThreadPoolNoAbort, ) -> Self { let 
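// The inline comparison removed above (now, per the diff, living inside
// `update_autogenerated`) boils down to: re-embed when regeneration was just
// enabled, when the old rendering cannot be recovered, or when the rendering
// actually changed. A distilled sketch of that decision:
fn must_regenerate(
    old_regenerate: bool,
    old_rendered: Option<&str>, // None: could not render the previous version
    new_rendered: &str,
) -> bool {
    if !old_regenerate {
        return true; // `regenerate` was just switched on
    }
    match old_rendered {
        Some(old) => old != new_rendered, // re-embed only if the prompt changed
        None => true,                     // cannot compare: safer to regenerate
    }
}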
possible_embedding_mistakes = PossibleEmbeddingMistakes::new(field_distribution); - Self { - embedders, - old_embedders, - embedder_actions, - embedder_category_id, - embedder_stats, - sender, - threads, - possible_embedding_mistakes, - } + Self { settings_delta, embedder_stats, sender, threads, possible_embedding_mistakes } } } -impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbeddingExtractor<'_, '_> { +impl<'extractor, SD: SettingsDelta + Sync> SettingsChangeExtractor<'extractor> + for SettingsChangeEmbeddingExtractor<'_, '_, SD> +{ type Data = RefCell>; fn init_data<'doc>(&'doc self, extractor_alloc: &'extractor Bump) -> crate::Result { @@ -347,44 +297,49 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding documents: impl Iterator>>, context: &'doc DocumentContext, ) -> crate::Result<()> { - let embedders = self.embedders.inner_as_ref(); - let old_embedders = self.old_embedders.inner_as_ref(); + let embedders = self.settings_delta.new_embedders(); + let old_embedders = self.settings_delta.old_embedders(); let unused_vectors_distribution = UnusedVectorsDistributionBump::new_in(&context.doc_alloc); let mut all_chunks = BVec::with_capacity_in(embedders.len(), &context.doc_alloc); - for (embedder_name, (embedder, prompt, _is_quantized)) in embedders { - // if the embedder is not in the embedder_actions, we don't need to reindex. - if let Some((embedder_id, reindex_action)) = - self.embedder_actions - .get(embedder_name) - // keep only the reindex actions - .and_then(EmbedderAction::reindex) - // map the reindex action to the embedder_id - .map(|reindex| { - let embedder_id = self.embedder_category_id.get(embedder_name).expect( - "An embedder_category_id must exist for all reindexed embedders", - ); - (*embedder_id, reindex) - }) - { - all_chunks.push(( - Chunks::new( - embedder, - embedder_id, - embedder_name, - prompt, - context.data, - &self.possible_embedding_mistakes, - self.embedder_stats, - self.threads, - self.sender, - &context.doc_alloc, - ), - reindex_action, - )) - } + let embedder_configs = context.index.embedding_configs(); + for (embedder_name, action) in self.settings_delta.embedder_actions().iter() { + let Some(reindex_action) = action.reindex() else { + continue; + }; + let runtime = embedders + .get(embedder_name) + .expect("A runtime must exist for all reindexed embedder"); + let embedder_info = embedder_configs + .embedder_info(&context.rtxn, embedder_name)? 
+ .unwrap_or_else(|| { + // new embedder + EmbedderInfo { + embedder_id: *self + .settings_delta + .new_embedder_category_id() + .get(embedder_name) + .expect( + "An embedder_category_id must exist for all reindexed embedders", + ), + embedding_status: EmbeddingStatus::new(), + } + }); + all_chunks.push(( + Chunks::new( + runtime, + embedder_info, + embedder_name.as_str(), + context.data, + &self.possible_embedding_mistakes, + self.embedder_stats, + self.threads, + self.sender, + &context.doc_alloc, + ), + reindex_action, + )); } - for document in documents { let document = document?; @@ -398,6 +353,16 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding for (chunks, reindex_action) in &mut all_chunks { let embedder_name = chunks.embedder_name(); let current_vectors = current_vectors.vectors_for_key(embedder_name)?; + let (old_is_user_provided, _) = + chunks.is_user_provided_must_regenerate(document.docid()); + let old_has_fragments = old_embedders + .get(embedder_name) + .map(|embedder| !embedder.fragments().is_empty()) + .unwrap_or_default(); + + let new_has_fragments = chunks.has_fragments(); + + let fragments_changed = old_has_fragments ^ new_has_fragments; // if the vectors for this document have been already provided, we don't need to reindex. let (is_new_embedder, must_regenerate) = @@ -406,60 +371,33 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding }); match reindex_action { - ReindexAction::RegeneratePrompts => { + ReindexAction::RegeneratePrompts | ReindexAction::RegenerateFragments(_) => { if !must_regenerate { continue; } // we need to regenerate the prompts for the document - - // Get the old prompt and render the document with it - let Some((_, old_prompt, _)) = old_embedders.get(embedder_name) else { - unreachable!("ReindexAction::RegeneratePrompts implies that the embedder {embedder_name} is in the old_embedders") - }; - let old_rendered = old_prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, + &unused_vectors_distribution, + old_is_user_provided, + fragments_changed, )?; - - // Get the new prompt and render the document with it - let new_prompt = chunks.prompt(); - let new_rendered = new_prompt.render_document( - document.external_document_id(), - document.current( - &context.rtxn, - context.index, - context.db_fields_ids_map, - )?, - context.new_fields_ids_map, - &context.doc_alloc, - )?; - - // Compare the rendered documents - // if they are different, regenerate the vectors - if new_rendered != old_rendered { - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - new_rendered, - &unused_vectors_distribution, - )?; - } } ReindexAction::FullReindex => { - let prompt = chunks.prompt(); // if no inserted vectors, then regenerate: true + no embeddings => autogenerate if let Some(embeddings) = current_vectors .and_then(|vectors| vectors.embeddings) // insert the embeddings only for new embedders .filter(|_| is_new_embedder) { - chunks.set_regenerate(document.docid(), must_regenerate); chunks.set_vectors( document.external_document_id(), document.docid(), @@ -469,24 +407,27 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding error: error.to_string(), }, )?, + old_is_user_provided, + true, + must_regenerate, )?; } else if must_regenerate 
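// Sketch of the fragment-change test computed above: a settings reindex
// counts as a fragment change when exactly one of the old and new embedder
// configs uses fragments (XOR), since moving between prompt-based and
// fragment-based extraction invalidates the per-extractor stores.
fn fragments_changed(old_has_fragments: bool, new_has_fragments: bool) -> bool {
    old_has_fragments ^ new_has_fragments
}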
{ - let rendered = prompt.render_document( + chunks.settings_change_autogenerated( + document.docid(), document.external_document_id(), document.current( &context.rtxn, context.index, context.db_fields_ids_map, )?, + self.settings_delta, context.new_fields_ids_map, - &context.doc_alloc, - )?; - chunks.set_autogenerated( - document.docid(), - document.external_document_id(), - rendered, &unused_vectors_distribution, + old_is_user_provided, + true, )?; + } else if is_new_embedder { + chunks.set_status(document.docid(), false, true, false, false); } } } @@ -501,156 +442,73 @@ impl<'extractor> SettingsChangeExtractor<'extractor> for SettingsChangeEmbedding } } -// **Warning**: the destructor of this struct is not normally run, make sure that all its fields: -// 1. don't have side effects tied to they destructors -// 2. if allocated, are allocated inside of the bumpalo -// -// Currently this is the case as: -// 1. BVec are inside of the bumaplo -// 2. All other fields are either trivial (u8) or references. -struct Chunks<'a, 'b, 'extractor> { - texts: BVec<'a, &'a str>, - ids: BVec<'a, DocumentId>, - - embedder: &'a Embedder, +pub struct OnEmbeddingDocumentUpdates<'doc, 'b> { embedder_id: u8, - embedder_name: &'a str, - dimensions: usize, - prompt: &'a Prompt, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - user_provided: &'a RefCell>, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, + sender: EmbeddingSender<'doc, 'b>, + possible_embedding_mistakes: &'doc PossibleEmbeddingMistakes, } -impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { - #[allow(clippy::too_many_arguments)] - pub fn new( - embedder: &'a Embedder, - embedder_id: u8, - embedder_name: &'a str, - prompt: &'a Prompt, - user_provided: &'a RefCell>, - possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, - embedder_stats: &'a EmbedderStats, - threads: &'a ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - doc_alloc: &'a Bump, - ) -> Self { - let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); - let texts = BVec::with_capacity_in(capacity, doc_alloc); - let ids = BVec::with_capacity_in(capacity, doc_alloc); - let dimensions = embedder.dimensions(); - Self { - texts, - ids, - embedder, - prompt, - possible_embedding_mistakes, - embedder_stats, - threads, - sender, - embedder_id, - embedder_name, - user_provided, - has_manual_generation: None, - dimensions, - } +impl OnEmbeddingDocumentUpdates<'_, '_> { + fn clear_vectors(&self, docid: DocumentId) { + self.sender.set_vectors(docid, self.embedder_id, vec![]).unwrap(); } - pub fn set_autogenerated( + fn process_embeddings(&mut self, metadata: Metadata<'_>, embeddings: Vec) { + self.sender.set_vectors(metadata.docid, self.embedder_id, embeddings).unwrap(); + } +} + +impl<'doc> OnEmbed<'doc> for OnEmbeddingDocumentUpdates<'doc, '_> { + type ErrorMetadata = UnusedVectorsDistributionBump<'doc>; + fn process_embedding_response( &mut self, - docid: DocumentId, - external_docid: &'a str, - rendered: &'a str, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_)); - if is_manual { - self.has_manual_generation.get_or_insert(external_docid); - } - - if self.texts.len() < self.texts.capacity() { - self.texts.push(rendered); - self.ids.push(docid); - return Ok(()); - } - - Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - 
self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation.take(), - ) + response: crate::vector::session::EmbeddingResponse<'doc>, + ) { + self.sender + .set_vector( + response.metadata.docid, + self.embedder_id, + response.metadata.extractor_id, + response.embedding, + ) + .unwrap(); } - - pub fn drain( - mut self, + fn process_embedding_error( + &mut self, + error: crate::vector::hf::EmbedError, + embedder_name: &'doc str, unused_vectors_distribution: &UnusedVectorsDistributionBump, - ) -> Result<()> { - let res = Self::embed_chunks( - &mut self.texts, - &mut self.ids, - self.embedder, - self.embedder_id, - self.embedder_name, - self.possible_embedding_mistakes, - self.embedder_stats, - unused_vectors_distribution, - self.threads, - self.sender, - self.has_manual_generation, - ); - // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff - std::mem::forget(self); - res - } - - #[allow(clippy::too_many_arguments)] - pub fn embed_chunks( - texts: &mut BVec<'a, &'a str>, - ids: &mut BVec<'a, DocumentId>, - embedder: &Embedder, - embedder_id: u8, - embedder_name: &str, - possible_embedding_mistakes: &PossibleEmbeddingMistakes, - embedder_stats: &EmbedderStats, - unused_vectors_distribution: &UnusedVectorsDistributionBump, - threads: &ThreadPoolNoAbort, - sender: EmbeddingSender<'a, 'b>, - has_manual_generation: Option<&'a str>, - ) -> Result<()> { - if let Some(external_docid) = has_manual_generation { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}", - external_docid, - if ids.len() > 1 { - format!(" and at least {} other document(s)", ids.len() - 1) - } else { - "".to_string() - } - ); - - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + metadata: BVec<'doc, Metadata<'doc>>, + ) -> crate::Error { + if let FaultSource::Bug = error.fault { + crate::Error::InternalError(crate::InternalError::VectorEmbeddingError(error.into())) + } else { + let mut msg = if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{} +- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.", + if let Some(first) = metadata.first() { first.external_docid } else { "???" 
}, + if metadata.len() > 1 { + format!(" and at least {} other document(s)", metadata.len() - 1) + } else { + "".to_string() + } + ) + } else { + format!(r"While embedding documents for embedder `{embedder_name}`: {error}") + }; let mut hint_count = 0; - for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2) + for (vector_misspelling, count) in + self.possible_embedding_mistakes.vector_mistakes().take(2) { msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); hint_count += 1; } - for (embedder_misspelling, count) in possible_embedding_mistakes + for (embedder_misspelling, count) in self + .possible_embedding_mistakes .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) .take(2) { @@ -659,107 +517,516 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } if hint_count == 0 { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - - return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); - } - - let res = match embedder.embed_index_ref(texts.as_slice(), threads, embedder_stats) { - Ok(embeddings) => { - for (docid, embedding) in ids.into_iter().zip(embeddings) { - sender.set_vector(*docid, embedder_id, embedding).unwrap(); - } - Ok(()) - } - Err(error) => { - if let FaultSource::Bug = error.fault { - Err(crate::Error::InternalError(crate::InternalError::VectorEmbeddingError( - error.into(), - ))) - } else { - let mut msg = format!( - r"While embedding documents for embedder `{embedder_name}`: {error}" + if let EmbedErrorKind::ManualEmbed(_) = &error.kind { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" ); - - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); - } - - let mut hint_count = 0; - - for (vector_misspelling, count) in - possible_embedding_mistakes.vector_mistakes().take(2) - { - msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); - hint_count += 1; - } - - for (embedder_misspelling, count) in possible_embedding_mistakes - .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) - .take(2) - { - msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); - hint_count += 1; - } - - if hint_count == 0 { - if let EmbedErrorKind::ManualEmbed(_) = &error.kind { - msg += &format!( - "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" - ); - } - } - - Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))) } } + + crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)) + } + } +} + +struct Chunks<'a, 'b, 'extractor> { + dimensions: usize, + status_delta: &'a RefCell<EmbeddingExtractorData<'extractor>>, + status: EmbeddingStatus, + kind: ChunkType<'a, 'b>, +} + +enum ChunkType<'a, 'b> { + DocumentTemplate { + document_template: &'a Prompt, + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, &'a str>, + }, + Fragments { + fragments: &'a [RuntimeFragment], + session: EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, serde_json::Value>, + }, +} + +impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { + #[allow(clippy::too_many_arguments)] + pub fn new( + runtime: &'a RuntimeEmbedder, + embedder_info: EmbedderInfo, + embedder_name: &'a str, + status_delta: &'a RefCell<EmbeddingExtractorData<'extractor>>, + possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, + embedder_stats: &'a EmbedderStats, + threads: &'a ThreadPoolNoAbort, + sender: EmbeddingSender<'a, 'b>, + doc_alloc: &'a Bump, + ) -> Self { + let embedder = &runtime.embedder; + let dimensions = embedder.dimensions(); + + let fragments = runtime.fragments(); + let kind = if fragments.is_empty() { + ChunkType::DocumentTemplate { + document_template: &runtime.document_template, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } + } else { + ChunkType::Fragments { + fragments, + session: EmbedSession::new( + &runtime.embedder, + embedder_name, + threads, + doc_alloc, + embedder_stats, + OnEmbeddingDocumentUpdates { + embedder_id: embedder_info.embedder_id, + sender, + possible_embedding_mistakes, + }, + ), + } }; - texts.clear(); - ids.clear(); - res + + Self { dimensions, status: embedder_info.embedding_status, status_delta, kind } } - pub fn prompt(&self) -> &'a Prompt { - self.prompt + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + self.status.is_user_provided_must_regenerate(docid) + } + + #[allow(clippy::too_many_arguments)] + pub fn settings_change_autogenerated<'doc, D: Document<'doc> + Debug, SD: SettingsDelta>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + document: D, + settings_delta: &SD, + fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'_>>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + full_reindex: bool, + ) -> Result<()> + where + 'a: 'doc, + { + match &mut self.kind { + ChunkType::Fragments { fragments: _, session } => { + let doc_alloc = session.doc_alloc(); + + if old_is_user_provided | full_reindex { + session.on_embed_mut().clear_vectors(docid); + } + + settings_delta.try_for_each_fragment_diff( + session.embedder_name(), + |fragment_diff| { + let extractor = RequestFragmentExtractor::new(fragment_diff.new, doc_alloc) + .ignore_errors(); + let old = if full_reindex { + None + } else { + fragment_diff.old.map(|old| { + RequestFragmentExtractor::new(old, doc_alloc).ignore_errors() + }) + }; + let metadata = Metadata { + docid, + external_docid, + extractor_id: extractor.extractor_id(), + }; + + match extractor.diff_settings(&document, &(), old.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { + metadata, + embedding: None, + }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + session.request_embedding( + metadata, + input, + unused_vectors_distribution, + )?; + } + ExtractorDiff::Unchanged => { /* nothing to do */ } + } + + Result::Ok(()) + }, + )?; + self.set_status(docid, old_is_user_provided, true, false, true); + } + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + + let old_embedder = settings_delta.old_embedders().get(session.embedder_name()); + let old_document_template = if full_reindex { + None + } else { + old_embedder.as_ref().map(|old_embedder| &old_embedder.document_template) + }; + let extractor = + DocumentTemplateExtractor::new(document_template, doc_alloc, fields_ids_map); + let old_extractor = old_document_template.map(|old_document_template| { + DocumentTemplateExtractor::new(old_document_template, doc_alloc, fields_ids_map) + }); + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + match extractor.diff_settings(document, &external_docid, old_extractor.as_ref())? { + ExtractorDiff::Removed => { + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + ExtractorDiff::Added(input) | ExtractorDiff::Updated(input) => { + session.request_embedding(metadata, input, unused_vectors_distribution)?; + } + ExtractorDiff::Unchanged => { /* do nothing */ } + } + self.set_status(docid, old_is_user_provided, true, false, true); + } + } + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + pub fn update_autogenerated<'doc, OD: Document<'doc> + Debug, ND: Document<'doc> + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + old_document: OD, + new_document: ND, + new_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'_>>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, + ) -> Result<()> + where + 'a: 'doc, + { + match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + [ex], + old_document, + new_document, + &external_docid, + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + if old_is_user_provided { + session.on_embed_mut().clear_vectors(docid); + } + + update_autogenerated( + docid, + external_docid, + extractors, + old_document, + new_document, + &(), + old_must_regenerate, + session, + unused_vectors_distribution, + )? + } + }; + + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + false, + new_must_regenerate, + ); + + Ok(()) + } + + #[allow(clippy::too_many_arguments)] + pub fn insert_autogenerated<D: Document<'a> + Debug>( + &mut self, + docid: DocumentId, + external_docid: &'a str, + new_document: D, + new_fields_ids_map: &'a RefCell<GlobalFieldsIdsMap<'_>>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, + new_must_regenerate: bool, + ) -> Result<()> { + let (default_is_user_provided, default_must_regenerate) = (false, true); + self.set_status( + docid, + default_is_user_provided, + default_must_regenerate, + false, + new_must_regenerate, + ); + + match &mut self.kind { + ChunkType::DocumentTemplate { document_template, session } => { + let doc_alloc = session.doc_alloc(); + let ex = DocumentTemplateExtractor::new( + document_template, + doc_alloc, + new_fields_ids_map, + ); + + insert_autogenerated( + docid, + external_docid, + [ex], + new_document, + &external_docid, + session, + unused_vectors_distribution, + )?; + } + ChunkType::Fragments { fragments, session } => { + let doc_alloc = session.doc_alloc(); + let extractors = fragments.iter().map(|fragment| { + RequestFragmentExtractor::new(fragment, doc_alloc).ignore_errors() + }); + + insert_autogenerated( + docid, + external_docid, + extractors, + new_document, + &(), + session, + unused_vectors_distribution, + )?; + } + } + Ok(()) + } + + pub fn drain(self, unused_vectors_distribution: &UnusedVectorsDistributionBump) -> Result<()> { + match self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.drain(unused_vectors_distribution)?; + } + ChunkType::Fragments { fragments: _, session } => { + session.drain(unused_vectors_distribution)?; + } + } + Ok(()) + } pub fn embedder_name(&self) -> &'a str { - self.embedder_name - } - - fn set_regenerate(&self, docid: DocumentId, regenerate: bool) { - let mut user_provided = self.user_provided.borrow_mut(); - let user_provided = user_provided.0.entry_ref(self.embedder_name).or_default(); - if regenerate { - // regenerate == !user_provided - user_provided.insert_del_u32(docid); - } else { - user_provided.insert_add_u32(docid); + match &self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.embedder_name() + } + ChunkType::Fragments { fragments: _, session } => session.embedder_name(), } } - fn set_vectors( + fn set_status( + &self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + if EmbeddingStatusDelta::needs_change( + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.push_delta( + docid, + old_is_user_provided, + old_must_regenerate, + new_is_user_provided, + new_must_regenerate, + ); + } + } + + pub fn clear_status(&self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + // these values ensure both roaring bitmaps are at 0. + if EmbeddingStatusDelta::needs_clear(is_user_provided, must_regenerate) { + let mut status_delta = self.status_delta.borrow_mut(); + let status_delta = status_delta.0.entry_ref(self.embedder_name()).or_default(); + status_delta.clear_docid(docid, is_user_provided, must_regenerate); + } + } + + pub fn set_vectors( + &mut self, external_docid: &'a str, docid: DocumentId, embeddings: Vec<Embedding>, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_must_regenerate: bool, ) -> Result<()> { + self.set_status( + docid, + old_is_user_provided, + old_must_regenerate, + true, + new_must_regenerate, + ); for (embedding_index, embedding) in embeddings.iter().enumerate() { if embedding.len() != self.dimensions { return Err(UserError::InvalidIndexingVectorDimensions { expected: self.dimensions, found: embedding.len(), - embedder_name: self.embedder_name.to_string(), + embedder_name: self.embedder_name().to_string(), document_id: external_docid.to_string(), embedding_index, } .into()); } } - self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + match &mut self.kind { + ChunkType::DocumentTemplate { document_template: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + ChunkType::Fragments { fragments: _, session } => { + session.on_embed_mut().process_embeddings( + Metadata { docid, external_docid, extractor_id: 0 }, + embeddings, + ); + } + } + Ok(()) } + + fn has_fragments(&self) -> bool { + matches!(self.kind, ChunkType::Fragments { .. }) + } +} + +#[allow(clippy::too_many_arguments)] +fn update_autogenerated<'doc, 'a: 'doc, 'b, E, OD, ND>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator<Item = E>, + old_document: OD, + new_document: ND, + meta: &E::DocumentMetadata, + old_must_regenerate: bool, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result<()> +where + OD: Document<'doc> + Debug, + ND: Document<'doc> + Debug, + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From<E::Error>, +{ + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + let must_regenerate = if !old_must_regenerate { + // we just enabled `regenerate` + true + } else { + let old_rendered = extractor.extract(&old_document, meta); + + if let Ok(old_rendered) = old_rendered { + // must regenerate if the rendered changed + new_rendered != old_rendered + } else { + // cannot check previous rendered, better regenerate + true + } + }; + + if must_regenerate { + let metadata = + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }; + + if let Some(new_rendered) = new_rendered { + session.request_embedding(metadata, new_rendered, unused_vectors_distribution)? + } else { + // remove any existing embedding + OnEmbed::process_embedding_response( + session.on_embed_mut(), + crate::vector::session::EmbeddingResponse { metadata, embedding: None }, + ); + } + } + } + + Ok(()) +} + +fn insert_autogenerated<'a, 'b, E, D: Document<'a> + Debug>( + docid: DocumentId, + external_docid: &'a str, + extractors: impl IntoIterator<Item = E>, + new_document: D, + meta: &E::DocumentMetadata, + session: &mut EmbedSession<'a, OnEmbeddingDocumentUpdates<'a, 'b>, E::Input>, + unused_vectors_distribution: &UnusedVectorsDistributionBump<'a>, +) -> Result<()> +where + E: VectorExtractor<'a>, + E::Input: Input, + crate::Error: From<E::Error>, +{ + for extractor in extractors { + let new_rendered = extractor.extract(&new_document, meta)?; + + if let Some(new_rendered) = new_rendered { + session.request_embedding( + Metadata { docid, external_docid, extractor_id: extractor.extractor_id() }, + new_rendered, + unused_vectors_distribution, + )?; + } + } + + Ok(()) } diff --git a/crates/milli/src/update/new/indexer/extract.rs b/crates/milli/src/update/new/indexer/extract.rs index bb275d8aa..abfb4d6da 100644 --- a/crates/milli/src/update/new/indexer/extract.rs +++ b/crates/milli/src/update/new/indexer/extract.rs @@ -13,21 +13,17 @@ use super::super::thread_local::{FullySend, ThreadLocal}; use super::super::FacetFieldIdsDelta; use super::document_changes::{extract, DocumentChanges, IndexingContext}; use super::settings_changes::settings_change_extract; -use crate::documents::FieldIdMapper; -use crate::documents::PrimaryKey; -use crate::index::IndexEmbeddingConfig; -use crate::progress::EmbedderStats; -use crate::progress::MergingWordCache; +use crate::documents::{FieldIdMapper, PrimaryKey}; +use crate::progress::{EmbedderStats, MergingWordCache}; use crate::proximity::ProximityPrecision; use crate::update::new::extract::EmbeddingExtractor; use crate::update::new::indexer::settings_changes::DocumentsIndentifiers; use crate::update::new::merger::merge_and_send_rtree; use crate::update::new::{merge_and_send_docids, merge_and_send_facet_docids, FacetDatabases}; use crate::update::settings::SettingsDelta; -use crate::vector::EmbeddingConfigs; -use crate::Index; -use crate::InternalError; -use crate::{Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; +use crate::vector::db::{EmbedderInfo, IndexEmbeddingConfig}; +use crate::vector::RuntimeEmbedders; +use crate::{Index, InternalError, Result, ThreadPoolNoAbort, ThreadPoolNoAbortBuilder}; #[allow(clippy::too_many_arguments)] pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( @@ -35,7 +31,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>( document_changes: &DC, indexing_context: IndexingContext<MSP>, indexer_span: Span, extractor_sender: ExtractorBbqueueSender, - embedders: &EmbeddingConfigs, + embedders: &RuntimeEmbedders, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>, finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap<String, u64>, @@ -275,14 +271,19 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = index.embedding_configs(); for config in &mut index_embeddings { + let mut infos = embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap(); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + +
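// every thread-local delta for this embedder has now been merged into `infos`; forward the updated status to the writer side through the channel. +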
extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } @@ -332,12 +333,11 @@ pub(super) fn extract_all_settings_changes( finished_extraction: &AtomicBool, field_distribution: &mut BTreeMap, mut index_embeddings: Vec, - modified_docids: &mut RoaringBitmap, embedder_stats: &EmbedderStats, ) -> Result> where MSP: Fn() -> bool + Sync, - SD: SettingsDelta, + SD: SettingsDelta + Sync, { // Create the list of document ids to extract let rtxn = indexing_context.index.read_txn()?; @@ -368,10 +368,7 @@ where // extract the remaining embeddings let extractor = SettingsChangeEmbeddingExtractor::new( - settings_delta.new_embedders(), - settings_delta.old_embedders(), - settings_delta.embedder_actions(), - settings_delta.new_embedder_category_id(), + settings_delta, embedder_stats, embedding_sender, field_distribution, @@ -395,14 +392,25 @@ where let span = tracing::debug_span!(target: "indexing::documents::merge", "vectors"); let _entered = span.enter(); + let embedder_configs = indexing_context.index.embedding_configs(); for config in &mut index_embeddings { + // retrieve infos for existing embedder or create a fresh one + let mut infos = + embedder_configs.embedder_info(&rtxn, &config.name)?.unwrap_or_else(|| { + let embedder_id = + *settings_delta.new_embedder_category_id().get(&config.name).unwrap(); + EmbedderInfo { embedder_id, embedding_status: Default::default() } + }); + 'data: for data in datastore.iter_mut() { let data = &mut data.get_mut().0; - let Some(deladd) = data.remove(&config.name) else { + let Some(delta) = data.remove(&config.name) else { continue 'data; }; - deladd.apply_to(&mut config.user_provided, modified_docids); + delta.apply_to(&mut infos.embedding_status); } + + extractor_sender.embeddings().embedding_status(&config.name, infos).unwrap(); } } } diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 0efef48fd..a6ba3a919 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -23,8 +23,8 @@ use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::progress::{EmbedderStats, Progress}; use crate::update::settings::SettingsDelta; use crate::update::GrenadParameters; -use crate::vector::settings::{EmbedderAction, WriteBackToDocuments}; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs}; +use crate::vector::settings::{EmbedderAction, RemoveFragments, WriteBackToDocuments}; +use crate::vector::{ArroyWrapper, Embedder, RuntimeEmbedders}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result, ThreadPoolNoAbort}; pub(crate) mod de; @@ -54,7 +54,7 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP>( new_fields_ids_map: FieldsIdsMap, new_primary_key: Option>, document_changes: &DC, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, must_stop_processing: &'indexer MSP, progress: &'indexer Progress, embedder_stats: &'indexer EmbedderStats, @@ -93,7 +93,7 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; let mut document_ids = index.documents_ids(wtxn)?; let mut modified_docids = roaring::RoaringBitmap::new(); @@ -133,20 +133,21 @@ where let arroy_writers: Result> = embedders .inner_as_ref() .iter() - .map(|(embedder_name, (embedder, _, was_quantized))| { - let embedder_index = 
index.embedder_category_id.get(wtxn, embedder_name)?.ok_or( - InternalError::DatabaseMissingEntry { + .map(|(embedder_name, runtime)| { + let embedder_index = index + .embedding_configs() + .embedder_id(wtxn, embedder_name)? + .ok_or(InternalError::DatabaseMissingEntry { db_name: "embedder_category_id", key: None, - }, - )?; + })?; - let dimensions = embedder.dimensions(); - let writer = ArroyWrapper::new(vector_arroy, embedder_index, *was_quantized); + let dimensions = runtime.embedder.dimensions(); + let writer = ArroyWrapper::new(vector_arroy, embedder_index, runtime.is_quantized); Ok(( embedder_index, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), &*runtime.embedder, writer, dimensions), )) }) .collect(); @@ -220,7 +221,7 @@ where MSP: Fn() -> bool + Sync, SD: SettingsDelta + Sync, { - delete_old_embedders(wtxn, index, settings_delta)?; + delete_old_embedders_and_fragments(wtxn, index, settings_delta)?; let mut bbbuffers = Vec::new(); let finished_extraction = AtomicBool::new(false); @@ -253,16 +254,14 @@ where grenad_parameters: &grenad_parameters, }; - let index_embeddings = index.embedding_configs(wtxn)?; + let index_embeddings = index.embedding_configs().embedding_configs(wtxn)?; let mut field_distribution = index.field_distribution(wtxn)?; - let mut modified_docids = roaring::RoaringBitmap::new(); let congestion = thread::scope(|s| -> Result { let indexer_span = tracing::Span::current(); let finished_extraction = &finished_extraction; // prevent moving the field_distribution and document_ids in the inner closure... let field_distribution = &mut field_distribution; - let modified_docids = &mut modified_docids; let extractor_handle = Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || { pool.install(move || { @@ -275,7 +274,6 @@ where finished_extraction, field_distribution, index_embeddings, - modified_docids, &embedder_stats, ) }) @@ -341,7 +339,7 @@ where fn arroy_writers_from_embedder_actions<'indexer>( index: &Index, embedder_actions: &'indexer BTreeMap, - embedders: &'indexer EmbeddingConfigs, + embedders: &'indexer RuntimeEmbedders, index_embedder_category_ids: &'indexer std::collections::HashMap, ) -> Result> { let vector_arroy = index.vector_arroy; @@ -349,7 +347,7 @@ fn arroy_writers_from_embedder_actions<'indexer>( embedders .inner_as_ref() .iter() - .filter_map(|(embedder_name, (embedder, _, _))| match embedder_actions.get(embedder_name) { + .filter_map(|(embedder_name, runtime)| match embedder_actions.get(embedder_name) { None => None, Some(action) if action.write_back().is_some() => None, Some(action) => { @@ -364,25 +362,65 @@ fn arroy_writers_from_embedder_actions<'indexer>( }; let writer = ArroyWrapper::new(vector_arroy, embedder_category_id, action.was_quantized); - let dimensions = embedder.dimensions(); + let dimensions = runtime.embedder.dimensions(); Some(Ok(( embedder_category_id, - (embedder_name.as_str(), embedder.as_ref(), writer, dimensions), + (embedder_name.as_str(), runtime.embedder.as_ref(), writer, dimensions), ))) } }) .collect() } -fn delete_old_embedders(wtxn: &mut RwTxn<'_>, index: &Index, settings_delta: &SD) -> Result<()> +fn delete_old_embedders_and_fragments( + wtxn: &mut RwTxn<'_>, + index: &Index, + settings_delta: &SD, +) -> Result<()> where SD: SettingsDelta, { for action in settings_delta.embedder_actions().values() { - if let Some(WriteBackToDocuments { embedder_id, .. 
}) = action.write_back() { - let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); - let dimensions = reader.dimensions(wtxn)?; - reader.clear(wtxn, dimensions)?; + let Some(WriteBackToDocuments { embedder_id, .. }) = action.write_back() else { + continue; + }; + let reader = ArroyWrapper::new(index.vector_arroy, *embedder_id, action.was_quantized); + let Some(dimensions) = reader.dimensions(wtxn)? else { + continue; + }; + reader.clear(wtxn, dimensions)?; + } + + // remove all vectors for the specified fragments + for (embedder_name, RemoveFragments { fragment_ids }, was_quantized) in + settings_delta.embedder_actions().iter().filter_map(|(name, action)| { + action.remove_fragments().map(|fragments| (name, fragments, action.was_quantized)) + }) + { + let Some(infos) = index.embedding_configs().embedder_info(wtxn, embedder_name)? else { + continue; + }; + let arroy = ArroyWrapper::new(index.vector_arroy, infos.embedder_id, was_quantized); + let Some(dimensions) = arroy.dimensions(wtxn)? else { + continue; + }; + for fragment_id in fragment_ids { + // we must keep the user provided embeddings that ended up in this store + + if infos.embedding_status.user_provided_docids().is_empty() { + // no user provided: clear store + arroy.clear_store(wtxn, *fragment_id, dimensions)?; + continue; + } + + // some user provided, remove only the ids that are not user provided + let to_delete = arroy.items_in_store(wtxn, *fragment_id, |items| { + items - infos.embedding_status.user_provided_docids() + })?; + + for to_delete in to_delete { + arroy.del_item_in_store(wtxn, to_delete, *fragment_id, dimensions)?; + } } } diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index fa48ff589..b8e3685f8 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -11,11 +11,11 @@ use super::super::channel::*; use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; -use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; +use crate::vector::db::IndexEmbeddingConfig; use crate::vector::settings::EmbedderAction; -use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; +use crate::vector::{ArroyWrapper, Embedder, Embeddings, RuntimeEmbedders}; use crate::{Error, Index, InternalError, Result, UserError}; pub fn write_to_db( @@ -64,6 +64,14 @@ pub fn write_to_db( writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; } + ReceiverAction::LargeVector( + large_vector @ LargeVector { docid, embedder_id, extractor_id, .. 
}, + ) => { + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = large_vector.read_embedding(*dimensions); + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } // Every time there is a message in the channel we search @@ -137,7 +145,7 @@ where )?; } - index.put_embedding_configs(wtxn, index_embeddings)?; + index.embedding_configs().put_embedding_configs(wtxn, index_embeddings)?; Ok(()) } @@ -147,7 +155,7 @@ pub(super) fn update_index( wtxn: &mut RwTxn<'_>, new_fields_ids_map: FieldIdMapWithMetadata, new_primary_key: Option<PrimaryKey<'_>>, - embedders: EmbeddingConfigs, + embedders: RuntimeEmbedders, field_distribution: std::collections::BTreeMap<String, u64>, document_ids: roaring::RoaringBitmap, ) -> Result<()> { @@ -226,14 +234,36 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - if embeddings.append(all_embeddings.to_vec()).is_err() { - return Err(Error::UserError(UserError::InvalidVectorDimensions { - expected: *dimensions, - found: all_embeddings.len(), - })); - } writer.del_items(wtxn, *dimensions, docid)?; - writer.add_items(wtxn, docid, &embeddings)?; + if !all_embeddings.is_empty() { + if embeddings.append(all_embeddings.to_vec()).is_err() { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: all_embeddings.len(), + })); + } + writer.add_items(wtxn, docid, &embeddings)?; + } + } + EntryHeader::ArroySetVector( + asv @ ArroySetVector { docid, embedder_id, extractor_id, .. }, + ) => { + let frame = frame_with_header.frame(); + let (_, _, writer, dimensions) = + arroy_writers.get(&embedder_id).expect("requested a missing embedder"); + let embedding = asv.read_all_embeddings_into_vec(frame, aligned_embedding); + + if embedding.is_empty() { + writer.del_item_in_store(wtxn, docid, extractor_id, *dimensions)?; + } else { + if embedding.len() != *dimensions { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: embedding.len(), + })); + } + writer.add_item_in_store(wtxn, docid, extractor_id, embedding)?; + } } } } diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index a52dab6a1..b59984248 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -12,9 +12,9 @@ use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use super::indexer::de::DeserrRawValue; use crate::constants::RESERVED_VECTORS_FIELD_NAME; use crate::documents::FieldIdMapper; -use crate::index::IndexEmbeddingConfig; +use crate::vector::db::{EmbeddingStatus, IndexEmbeddingConfig}; use crate::vector::parsed_vectors::{RawVectors, RawVectorsError, VectorOrArrayOfVectors}; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfigs}; +use crate::vector::{ArroyWrapper, Embedding, RuntimeEmbedders}; use crate::{DocumentId, Index, InternalError, Result, UserError}; #[derive(Serialize)] @@ -109,7 +109,7 @@ impl<'t> VectorDocumentFromDb<'t> { None => None, }; - let embedding_config = index.embedding_configs(rtxn)?; + let embedding_config = index.embedding_configs().embedding_configs(rtxn)?; Ok(Some(Self { docid, embedding_config, index, vectors_field, rtxn, doc_alloc })) } @@ -118,6 +118,7 @@ impl<'t> VectorDocumentFromDb<'t> { &self, embedder_id:
u8, config: &IndexEmbeddingConfig, + status: &EmbeddingStatus, ) -> Result> { let reader = ArroyWrapper::new(self.index.vector_arroy, embedder_id, config.config.quantized()); @@ -126,7 +127,7 @@ impl<'t> VectorDocumentFromDb<'t> { Ok(VectorEntry { has_configured_embedder: true, embeddings: Some(Embeddings::FromDb(vectors)), - regenerate: !config.user_provided.contains(self.docid), + regenerate: status.must_regenerate(self.docid), implicit: false, }) } @@ -137,9 +138,9 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { self.embedding_config .iter() .map(|config| { - let embedder_id = - self.index.embedder_category_id.get(self.rtxn, &config.name)?.unwrap(); - let entry = self.entry_from_db(embedder_id, config)?; + let info = + self.index.embedding_configs().embedder_info(self.rtxn, &config.name)?.unwrap(); + let entry = self.entry_from_db(info.embedder_id, config, &info.embedding_status)?; let config_name = self.doc_alloc.alloc_str(config.name.as_str()); Ok((&*config_name, entry)) }) @@ -156,11 +157,11 @@ impl<'t> VectorDocument<'t> for VectorDocumentFromDb<'t> { } fn vectors_for_key(&self, key: &str) -> Result>> { - Ok(match self.index.embedder_category_id.get(self.rtxn, key)? { - Some(embedder_id) => { + Ok(match self.index.embedding_configs().embedder_info(self.rtxn, key)? { + Some(info) => { let config = self.embedding_config.iter().find(|config| config.name == key).unwrap(); - Some(self.entry_from_db(embedder_id, config)?) + Some(self.entry_from_db(info.embedder_id, config, &info.embedding_status)?) } None => match self.vectors_field.as_ref().and_then(|obkv| obkv.get(key)) { Some(embedding_from_doc) => { @@ -222,7 +223,7 @@ fn entry_from_raw_value( pub struct VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, vectors: RawMap<'doc, FxBuildHasher>, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, } impl<'doc> VectorDocumentFromVersions<'doc> { @@ -230,7 +231,7 @@ impl<'doc> VectorDocumentFromVersions<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, bump: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let document = DocumentFromVersions::new(versions); if let Some(vectors_field) = document.vectors_field()? { @@ -283,7 +284,7 @@ impl<'doc> MergedVectorDocument<'doc> { db_fields_ids_map: &'doc Mapper, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; let new_doc = @@ -295,7 +296,7 @@ impl<'doc> MergedVectorDocument<'doc> { external_document_id: &'doc str, versions: &Versions<'doc>, doc_alloc: &'doc Bump, - embedders: &'doc EmbeddingConfigs, + embedders: &'doc RuntimeEmbedders, ) -> Result> { let Some(new_doc) = VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)? 
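Illustrative sketch, not part of the diff: the vector_document.rs hunks above replace the old `regenerate: !config.user_provided.contains(self.docid)` check with `regenerate: status.must_regenerate(self.docid)`. Assuming `EmbeddingStatus` holds the `user_provided` bitmap exposed by `user_provided_docids()` plus a second bitmap marking documents whose regenerate flag deviates from the default (this second field and its name are hypothetical), the lookup would amount to:

    use roaring::RoaringBitmap;

    struct EmbeddingStatus {
        user_provided: RoaringBitmap,      // documents whose vectors came from `_vectors`
        regenerate_differs: RoaringBitmap, // hypothetical: docs deviating from the default
    }

    impl EmbeddingStatus {
        fn must_regenerate(&self, docid: u32) -> bool {
            // default: regenerate exactly when the vectors were not user provided;
            // the second bitmap flips that default per document.
            !self.user_provided.contains(docid) ^ self.regenerate_differs.contains(docid)
        }
    }

Under an encoding like this, a status change touches at most two bitmaps per document, which is presumably why the extractors above accumulate compact `EmbeddingStatusDelta`s instead of raw docid sets.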
diff --git a/crates/milli/src/update/settings.rs b/crates/milli/src/update/settings.rs index c6ede7a1d..911f51865 100644 --- a/crates/milli/src/update/settings.rs +++ b/crates/milli/src/update/settings.rs @@ -7,7 +7,6 @@ use std::sync::Arc; use charabia::{Normalize, Tokenizer, TokenizerBuilder}; use deserr::{DeserializeError, Deserr}; use itertools::{merge_join_by, EitherOrBoth, Itertools}; -use roaring::RoaringBitmap; use serde::{Deserialize, Deserializer, Serialize, Serializer}; use time::OffsetDateTime; @@ -23,22 +22,25 @@ use crate::error::UserError::{self, InvalidChatSettingsDocumentTemplateMaxBytes} use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder}; use crate::filterable_attributes_rules::match_faceted_field; use crate::index::{ - ChatConfig, IndexEmbeddingConfig, PrefixSearch, SearchParameters, - DEFAULT_MIN_WORD_LEN_ONE_TYPO, DEFAULT_MIN_WORD_LEN_TWO_TYPOS, + ChatConfig, PrefixSearch, SearchParameters, DEFAULT_MIN_WORD_LEN_ONE_TYPO, + DEFAULT_MIN_WORD_LEN_TWO_TYPOS, }; use crate::order_by_map::OrderByMap; -use crate::progress::EmbedderStats; -use crate::progress::Progress; +use crate::progress::{EmbedderStats, Progress}; use crate::prompt::{default_max_bytes, default_template_text, PromptData}; use crate::proximity::ProximityPrecision; use crate::update::index_documents::IndexDocumentsMethod; use crate::update::new::indexer::reindex; use crate::update::{IndexDocuments, UpdateIndexingStep}; +use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; +use crate::vector::json_template::JsonTemplate; use crate::vector::settings::{ - EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, - SubEmbeddingSettings, WriteBackToDocuments, + EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext, + ReindexAction, SubEmbeddingSettings, WriteBackToDocuments, +}; +use crate::vector::{ + Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, }; -use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::{ ChannelCongestion, FieldId, FilterableAttributesRule, Index, LocalizedAttributesRule, Result, }; @@ -1044,22 +1046,27 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { match std::mem::take(&mut self.embedder_settings) { Setting::Set(configs) => self.update_embedding_configs_set(configs), Setting::Reset => { + let embedders = self.index.embedding_configs(); // all vectors should be written back to documents - let old_configs = self.index.embedding_configs(self.wtxn)?; + let old_configs = embedders.embedding_configs(self.wtxn)?; let remove_all: Result> = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| -> Result<_> { - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; + .map(|IndexEmbeddingConfig { name, config, fragments: _ }| -> Result<_> { + let embedder_info = embedders.embedder_info(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; Ok(( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: embedder_info.embedder_id, + user_provided: embedder_info + .embedding_status + .into_user_provided(), + }, config.quantized(), ), )) @@ -1069,7 +1076,7 @@ impl<'a, 't, 'i> Settings<'a, 
't, 'i> { let remove_all = remove_all?; self.index.embedder_category_id.clear(self.wtxn)?; - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; Ok(remove_all) } Setting::NotSet => Ok(Default::default()), @@ -1081,12 +1088,12 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { configs: BTreeMap>, ) -> Result> { use crate::vector::settings::SettingsDiff; - - let old_configs = self.index.embedding_configs(self.wtxn)?; - let old_configs: BTreeMap = old_configs + let embedders = self.index.embedding_configs(); + let old_configs = embedders.embedding_configs(self.wtxn)?; + let old_configs: BTreeMap = old_configs .into_iter() - .map(|IndexEmbeddingConfig { name, config, user_provided }| { - (name, (config.into(), user_provided)) + .map(|IndexEmbeddingConfig { name, config, fragments }| { + (name, (config.into(), fragments)) }) .collect(); let mut updated_configs = BTreeMap::new(); @@ -1097,71 +1104,111 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { { match joined { // updated config - EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => { + EitherOrBoth::Both((name, (old, mut fragments)), (_, new)) => { let was_quantized = old.binary_quantized.set().unwrap_or_default(); let settings_diff = SettingsDiff::from_settings(&name, old, new)?; match settings_diff { SettingsDiff::Remove => { + let info = embedders.remove_embedder(self.wtxn, &name)?.ok_or( + crate::InternalError::DatabaseMissingEntry { + db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, + key: None, + }, + )?; tracing::debug!( embedder = name, - user_provided = user_provided.len(), + user_provided = info.embedding_status.user_provided_docids().len(), "removing embedder" ); - let embedder_id = - self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or( - crate::InternalError::DatabaseMissingEntry { - db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID, - key: None, - }, - )?; - // free id immediately - self.index.embedder_category_id.delete(self.wtxn, &name)?; embedder_actions.insert( name, EmbedderAction::with_write_back( - WriteBackToDocuments { embedder_id, user_provided }, + WriteBackToDocuments { + embedder_id: info.embedder_id, + user_provided: info.embedding_status.into_user_provided(), + }, was_quantized, ), ); } SettingsDiff::Reindex { action, updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - ?action, - "reindex embedder" - ); - embedder_actions.insert( - name.clone(), + let mut remove_fragments = None; + let updated_settings = Setting::Set(updated_settings); + if let ReindexAction::RegenerateFragments(regenerate_fragments) = + &action + { + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { + matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name.as_str()); + + remove_fragments = fragments.remove_fragments(it); + + let it = regenerate_fragments + .iter() + .filter(|(_, action)| { + matches!( + action, + crate::vector::settings::RegenerateFragment::Add + ) + }) + .map(|(name, _)| name.clone()); + fragments.add_new_fragments(it)?; + } else { + // needs full reindex of fragments + fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings( + &updated_settings, + ), + )?; + } + tracing::debug!(embedder = name, ?action, "reindex embedder"); + + let embedder_action = EmbedderAction::with_reindex(action, was_quantized) - .with_is_being_quantized(quantize), - ); - let new = - 
validate_embedding_settings(Setting::Set(updated_settings), &name)?; - updated_configs.insert(name, (new, user_provided)); + .with_is_being_quantized(quantize); + + let embedder_action = if let Some(remove_fragments) = remove_fragments { + embedder_action.with_remove_fragments(remove_fragments) + } else { + embedder_action + }; + + embedder_actions.insert(name.clone(), embedder_action); + let new = validate_embedding_settings( + updated_settings, + &name, + EmbeddingValidationContext::FullSettings, + )?; + updated_configs.insert(name, (new, fragments)); } SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { - tracing::debug!( - embedder = name, - user_provided = user_provided.len(), - "update without reindex embedder" - ); - let new = - validate_embedding_settings(Setting::Set(updated_settings), &name)?; + tracing::debug!(embedder = name, "update without reindex embedder"); + let new = validate_embedding_settings( + Setting::Set(updated_settings), + &name, + EmbeddingValidationContext::FullSettings, + )?; if quantize { embedder_actions.insert( name.clone(), EmbedderAction::default().with_is_being_quantized(true), ); } - updated_configs.insert(name, (new, user_provided)); + updated_configs.insert(name, (new, fragments)); } } } // unchanged config - EitherOrBoth::Left((name, (setting, user_provided))) => { + EitherOrBoth::Left((name, (setting, fragments))) => { tracing::debug!(embedder = name, "unchanged embedder"); - updated_configs.insert(name, (Setting::Set(setting), user_provided)); + updated_configs.insert(name, (Setting::Set(setting), fragments)); } // new config EitherOrBoth::Right((name, mut setting)) => { @@ -1171,52 +1218,51 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> { crate::vector::settings::EmbeddingSettings::apply_default_openai_model( &mut setting, ); - let setting = validate_embedding_settings(setting, &name)?; + let setting = validate_embedding_settings( + setting, + &name, + EmbeddingValidationContext::FullSettings, + )?; embedder_actions.insert( name.clone(), EmbedderAction::with_reindex(ReindexAction::FullReindex, false), ); - updated_configs.insert(name, (setting, RoaringBitmap::new())); + let mut fragments = FragmentConfigs::new(); + fragments.add_new_fragments( + crate::vector::settings::fragments_from_settings(&setting), + )?; + updated_configs.insert(name, (setting, fragments)); } } } - let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; - for res in self.index.embedder_category_id.iter(self.wtxn)? 
{ - let (_name, id) = res?; - free_indices[id as usize] = false; - } - let mut free_indices = free_indices.iter_mut().enumerate(); - let mut find_free_index = - move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); - for (name, action) in embedder_actions.iter() { - // ignore actions that are not possible for a new embedder - if matches!(action.reindex(), Some(ReindexAction::FullReindex)) - && self.index.embedder_category_id.get(self.wtxn, name)?.is_none() - { - let id = - find_free_index().ok_or(UserError::TooManyEmbedders(updated_configs.len()))?; - tracing::debug!(embedder = name, id, "assigning free id to new embedder"); - self.index.embedder_category_id.put(self.wtxn, name, &id)?; - } - } + embedders.add_new_embedders( + self.wtxn, + embedder_actions + .iter() + // ignore actions that are not possible for a new embedder, most critically deleted embedders + .filter(|(_, action)| matches!(action.reindex(), Some(ReindexAction::FullReindex))) + .map(|(name, _)| name.as_str()), + updated_configs.len(), + )?; + let updated_configs: Vec = updated_configs .into_iter() - .filter_map(|(name, (config, user_provided))| match config { + .filter_map(|(name, (config, fragments))| match config { Setting::Set(config) => { - Some(IndexEmbeddingConfig { name, config: config.into(), user_provided }) + Some(IndexEmbeddingConfig { name, config: config.into(), fragments }) } Setting::Reset => None, Setting::NotSet => Some(IndexEmbeddingConfig { name, config: EmbeddingSettings::default().into(), - user_provided, + fragments: Default::default(), }), }) .collect(); if updated_configs.is_empty() { - self.index.delete_embedding_configs(self.wtxn)?; + embedders.delete_embedding_configs(self.wtxn)?; } else { - self.index.put_embedding_configs(self.wtxn, updated_configs)?; + embedders.put_embedding_configs(self.wtxn, updated_configs)?; } Ok(embedder_actions) } @@ -1543,6 +1589,7 @@ pub struct InnerIndexSettingsDiff { /// The set of only the additional searchable fields. /// If any other searchable field has been modified, is set to None. pub(crate) only_additional_fields: Option>, + fragment_diffs: BTreeMap, usize)>>, // Cache the check to see if all the stop_words, allowed_separators, dictionary, // exact_attributes, proximity_precision are different. @@ -1611,13 +1658,13 @@ impl InnerIndexSettingsDiff { // if the user-defined searchables changed, then we need to reindex prompts. if cache_user_defined_searchables { - for (embedder_name, (config, _, _quantized)) in - new_settings.embedding_configs.inner_as_ref() - { - let was_quantized = - old_settings.embedding_configs.get(embedder_name).is_some_and(|conf| conf.2); + for (embedder_name, runtime) in new_settings.runtime_embedders.inner_as_ref() { + let was_quantized = old_settings + .runtime_embedders + .get(embedder_name) + .is_some_and(|conf| conf.is_quantized); // skip embedders that don't use document templates - if !config.uses_document_template() { + if !runtime.embedder.uses_document_template() { continue; } @@ -1630,22 +1677,86 @@ impl InnerIndexSettingsDiff { was_quantized, )); } - std::collections::btree_map::Entry::Occupied(entry) => { + std::collections::btree_map::Entry::Occupied(mut entry) => { + // future-proofing, make sure to destructure here so that any new field is taken into account in this case + // case in point: adding `remove_fragments` was detected. 
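+ // (because the pattern below has no `..` rest binding, adding a field to `EmbedderAction` becomes a compile error right here)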
let EmbedderAction { was_quantized: _, is_being_quantized: _, - write_back: _, // We are deleting this embedder, so no point in regeneration - reindex: _, // We are already fully reindexing - } = entry.get(); + write_back, // We are deleting this embedder, so no point in regeneration + reindex, + remove_fragments: _, + } = entry.get_mut(); + + // fixup reindex to make sure we regenerate all fragments + *reindex = match reindex.take() { + Some(reindex) => Some(reindex), // We are at least regenerating prompts + None => { + if write_back.is_none() { + Some(ReindexAction::RegeneratePrompts) // quantization case + } else { + None + } + } + }; } }; } } + // build the fragment diffs + let mut fragment_diffs = BTreeMap::new(); + for (embedder_name, embedder_action) in &embedding_config_updates { + let Some(new_embedder) = new_settings.runtime_embedders.get(embedder_name) else { + continue; + }; + let regenerate_fragments = + if let Some(ReindexAction::RegenerateFragments(regenerate_fragments)) = + embedder_action.reindex() + { + either::Either::Left( + regenerate_fragments + .iter() + .filter(|(_, action)| { + !matches!( + action, + crate::vector::settings::RegenerateFragment::Remove + ) + }) + .map(|(name, _)| name), + ) + } else { + either::Either::Right( + new_embedder.fragments().iter().map(|fragment| &fragment.name), + ) + }; + + let old_embedder = old_settings.runtime_embedders.get(embedder_name); + + let mut fragments = Vec::new(); + for fragment_name in regenerate_fragments { + let Ok(new) = new_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + else { + continue; + }; + let old = old_embedder.as_ref().and_then(|old_embedder| { + old_embedder + .fragments() + .binary_search_by_key(&fragment_name, |fragment| &fragment.name) + .ok() + }); + fragments.push((old, new)); + } + fragment_diffs.insert(embedder_name.clone(), fragments); + } + InnerIndexSettingsDiff { old: old_settings, new: new_settings, primary_key_id, + fragment_diffs, embedding_config_updates, settings_update_only, only_additional_fields, @@ -1790,7 +1901,7 @@ pub(crate) struct InnerIndexSettings { pub exact_attributes: HashSet, pub disabled_typos_terms: DisabledTyposTerms, pub proximity_precision: ProximityPrecision, - pub embedding_configs: EmbeddingConfigs, + pub runtime_embedders: RuntimeEmbedders, pub embedder_category_id: HashMap, pub geo_fields_ids: Option<(FieldId, FieldId)>, pub prefix_search: PrefixSearch, @@ -1801,7 +1912,7 @@ impl InnerIndexSettings { pub fn from_index( index: &Index, rtxn: &heed::RoTxn<'_>, - embedding_configs: Option, + runtime_embedders: Option, ) -> Result { let stop_words = index.stop_words(rtxn)?; let stop_words = stop_words.map(|sw| sw.map_data(Vec::from).unwrap()); @@ -1810,13 +1921,13 @@ impl InnerIndexSettings { let mut fields_ids_map = index.fields_ids_map(rtxn)?; let exact_attributes = index.exact_attributes_ids(rtxn)?; let proximity_precision = index.proximity_precision(rtxn)?.unwrap_or_default(); - let embedding_configs = match embedding_configs { + let runtime_embedders = match runtime_embedders { Some(embedding_configs) => embedding_configs, - None => embedders(index.embedding_configs(rtxn)?)?, + None => embedders(index.embedding_configs().embedding_configs(rtxn)?)?, }; let embedder_category_id = index - .embedder_category_id - .iter(rtxn)? + .embedding_configs() + .iter_embedder_id(rtxn)? 
.map(|r| r.map(|(k, v)| (k.to_string(), v))) .collect::<heed::Result<_>>()?; let prefix_search = index.prefix_search(rtxn)?.unwrap_or_default(); @@ -1857,7 +1968,7 @@ impl InnerIndexSettings { sortable_fields, exact_attributes, proximity_precision, - embedding_configs, + runtime_embedders, embedder_category_id, geo_fields_ids, prefix_search, @@ -1900,28 +2011,49 @@ impl InnerIndexSettings { } } -fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<EmbeddingConfigs> { +fn embedders(embedding_configs: Vec<IndexEmbeddingConfig>) -> Result<RuntimeEmbedders> { let res: Result<_> = embedding_configs .into_iter() .map( |IndexEmbeddingConfig { name, config: EmbeddingConfig { embedder_options, prompt, quantized }, - .. + fragments, }| { - let prompt = Arc::new(prompt.try_into().map_err(crate::Error::from)?); + let document_template = prompt.try_into().map_err(crate::Error::from)?; - let embedder = Arc::new( + let embedder = // cache_cap: no cache needed for indexing purposes - Embedder::new(embedder_options.clone(), 0) + Arc::new(Embedder::new(embedder_options.clone(), 0) .map_err(crate::vector::Error::from) - .map_err(crate::Error::from)?, - ); - Ok((name, (embedder, prompt, quantized.unwrap_or_default()))) + .map_err(crate::Error::from)?); + + let fragments = fragments + .into_inner() + .into_iter() + .map(|fragment| { + let template = JsonTemplate::new( + embedder_options.fragment(&fragment.name).unwrap().clone(), + ) + .unwrap(); + + RuntimeFragment { name: fragment.name, id: fragment.id, template } + }) + .collect(); + + Ok(( + name, + Arc::new(RuntimeEmbedder::new( + embedder, + document_template, + fragments, + quantized.unwrap_or_default(), + )), + )) }, ) .collect(); - res.map(EmbeddingConfigs::new) + res.map(RuntimeEmbedders::new) } fn validate_prompt( @@ -1958,6 +2090,7 @@ fn validate_prompt( pub fn validate_embedding_settings( settings: Setting<EmbeddingSettings>, name: &str, + context: EmbeddingValidationContext, ) -> Result<Setting<EmbeddingSettings>> { let Setting::Set(settings) = settings else { return Ok(settings) }; let EmbeddingSettings { @@ -1970,6 +2103,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -1996,9 +2131,106 @@ pub fn validate_embedding_settings( })?; } + // used below + enum WithFragments { + Yes { + indexing_fragments: BTreeMap<String, serde_json::Value>, + search_fragments: BTreeMap<String, serde_json::Value>, + }, + No, + Maybe, + } + + let with_fragments = { + let has_reset = matches!(indexing_fragments, Setting::Reset) + || matches!(search_fragments, Setting::Reset); + let indexing_fragments: BTreeMap<_, _> = indexing_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + let search_fragments: BTreeMap<_, _> = search_fragments + .as_ref() + .set() + .iter() + .flat_map(|map| map.iter()) + .filter_map(|(name, fragment)| { + Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) + }) + .collect(); + + let has_fragments = !indexing_fragments.is_empty() || !search_fragments.is_empty(); + + if context == EmbeddingValidationContext::FullSettings { + let are_fragments_inconsistent = + indexing_fragments.is_empty() ^ search_fragments.is_empty(); + if are_fragments_inconsistent { + return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments( + indexing_fragments.is_empty(), + indexing_fragments, + search_fragments, + )) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); + } + } + if has_fragments {
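+        // Fragments are mutually exclusive with `documentTemplate`: a partial update
+        // that sets a document template while fragments are in use is rejected below.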
+ if context == EmbeddingValidationContext::SettingsPartialUpdate + && matches!(document_template, Setting::Set(_)) + { + return Err( + crate::vector::error::NewEmbedderError::rest_document_template_and_fragments( + indexing_fragments.len(), + search_fragments.len(), + ), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into()); + } + WithFragments::Yes { indexing_fragments, search_fragments } + } else if has_reset || context == EmbeddingValidationContext::FullSettings { + WithFragments::No + } else { + // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments + WithFragments::Maybe + } + }; if let Some(request) = request.as_ref().set() { - let request = crate::vector::rest::Request::new(request.to_owned()) - .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; + let request = match with_fragments { + WithFragments::Yes { indexing_fragments, search_fragments } => { + crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + search_fragments, + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + WithFragments::No => crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())), + WithFragments::Maybe => { + let mut indexing_fragments = BTreeMap::new(); + indexing_fragments.insert("test".to_string(), serde_json::json!("test")); + crate::vector::rest::RequestData::new( + request.to_owned(), + indexing_fragments, + Default::default(), + ) + .or_else(|_| { + crate::vector::rest::RequestData::new( + request.to_owned(), + Default::default(), + Default::default(), + ) + }) + .map_err(|error| crate::UserError::VectorEmbeddingError(error.into())) + } + }?; if let Some(response) = response.as_ref().set() { crate::vector::rest::Response::new(response.to_owned(), &request) .map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; @@ -2017,6 +2249,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -2036,6 +2270,8 @@ pub fn validate_embedding_settings( &dimensions, &api_key, &url, + &indexing_fragments, + &search_fragments, &request, &response, &document_template, @@ -2114,6 +2350,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2169,6 +2407,8 @@ pub fn validate_embedding_settings( &embedder.dimensions, &embedder.api_key, &embedder.url, + &embedder.indexing_fragments, + &embedder.search_fragments, &embedder.request, &embedder.response, &embedder.document_template, @@ -2201,6 +2441,8 @@ pub fn validate_embedding_settings( document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -2231,20 +2473,32 @@ fn deserialize_sub_embedder( /// Implement this trait for the settings delta type. /// This is used in the new settings update flow and will allow to easily replace the old settings delta type: `InnerIndexSettingsDiff`. 
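+///
+/// A minimal sketch of how a caller might drive the fragment diffs for one embedder
+/// (the embedder name and the closure body here are hypothetical):
+///
+/// ```ignore
+/// delta.try_for_each_fragment_diff("default", |FragmentDiff { old, new }| {
+///     // `old` is `None` when the fragment did not exist in the previous settings.
+///     let _ = (old, new.id, &new.name);
+///     Ok::<(), crate::Error>(())
+/// })?;
+/// ```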
pub trait SettingsDelta { - fn new_embedders(&self) -> &EmbeddingConfigs; - fn old_embedders(&self) -> &EmbeddingConfigs; + fn new_embedders(&self) -> &RuntimeEmbedders; + fn old_embedders(&self) -> &RuntimeEmbedders; fn new_embedder_category_id(&self) -> &HashMap; fn embedder_actions(&self) -> &BTreeMap; + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>; fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata; } +pub struct FragmentDiff<'a> { + pub old: Option<&'a RuntimeFragment>, + pub new: &'a RuntimeFragment, +} + impl SettingsDelta for InnerIndexSettingsDiff { - fn new_embedders(&self) -> &EmbeddingConfigs { - &self.new.embedding_configs + fn new_embedders(&self) -> &RuntimeEmbedders { + &self.new.runtime_embedders } - fn old_embedders(&self) -> &EmbeddingConfigs { - &self.old.embedding_configs + fn old_embedders(&self) -> &RuntimeEmbedders { + &self.old.runtime_embedders } fn new_embedder_category_id(&self) -> &HashMap { @@ -2258,6 +2512,37 @@ impl SettingsDelta for InnerIndexSettingsDiff { fn new_fields_ids_map(&self) -> &FieldIdMapWithMetadata { &self.new.fields_ids_map } + + fn try_for_each_fragment_diff( + &self, + embedder_name: &str, + mut for_each: F, + ) -> std::result::Result<(), E> + where + F: FnMut(FragmentDiff) -> std::result::Result<(), E>, + { + let Some(fragment_diff) = self.fragment_diffs.get(embedder_name) else { return Ok(()) }; + for (old, new) in fragment_diff { + let Some(new_runtime) = self.new.runtime_embedders.get(embedder_name) else { + continue; + }; + + let new = new_runtime.fragments().get(*new).unwrap(); + + match old { + Some(old) => { + if let Some(old_runtime) = self.old.runtime_embedders.get(embedder_name) { + let old = &old_runtime.fragments().get(*old).unwrap(); + for_each(FragmentDiff { old: Some(old), new })?; + } else { + for_each(FragmentDiff { old: None, new })?; + } + } + None => for_each(FragmentDiff { old: None, new })?, + }; + } + Ok(()) + } } #[cfg(test)] diff --git a/crates/milli/src/vector/db.rs b/crates/milli/src/vector/db.rs new file mode 100644 index 000000000..0e890fac9 --- /dev/null +++ b/crates/milli/src/vector/db.rs @@ -0,0 +1,443 @@ +//! 
Module containing types and methods to store meta-information about the embedders and fragments + +use std::borrow::Cow; + +use byteorder::{BigEndian, ReadBytesExt, WriteBytesExt}; +use heed::types::{SerdeJson, Str, U8}; +use heed::{BytesEncode, Database, RoTxn, RwTxn, Unspecified}; +use roaring::RoaringBitmap; +use serde::{Deserialize, Serialize}; + +use crate::vector::settings::RemoveFragments; +use crate::vector::EmbeddingConfig; +use crate::{CboRoaringBitmapCodec, DocumentId, UserError}; + +#[derive(Debug, Deserialize, Serialize)] +pub struct IndexEmbeddingConfig { + pub name: String, + pub config: EmbeddingConfig, + #[serde(default)] + pub fragments: FragmentConfigs, +} + +#[derive(Debug, Clone, Deserialize, Serialize, Default)] +pub struct FragmentConfigs(Vec<FragmentConfig>); + +impl FragmentConfigs { + pub fn new() -> Self { + Default::default() + } + pub fn as_slice(&self) -> &[FragmentConfig] { + self.0.as_slice() + } + + pub fn into_inner(self) -> Vec<FragmentConfig> { + self.0 + } + + pub fn remove_fragments<'a>( + &mut self, + fragments: impl IntoIterator<Item = &'a str>, + ) -> Option<RemoveFragments> { + let mut remove_fragments = Vec::new(); + for fragment in fragments { + let Ok(index_to_remove) = self.0.binary_search_by_key(&fragment, |f| &f.name) else { + continue; + }; + let fragment = self.0.swap_remove(index_to_remove); + remove_fragments.push(fragment.id); + } + (!remove_fragments.is_empty()).then_some(RemoveFragments { fragment_ids: remove_fragments }) + } + + pub fn add_new_fragments( + &mut self, + new_fragments: impl IntoIterator<Item = String>, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for FragmentConfig { id, name: _ } in self.0.iter() { + free_indices[*id as usize] = false; + } + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + let mut new_fragments = new_fragments.into_iter(); + + for name in &mut new_fragments { + let id = match find_free_index() { + Some(id) => id, + None => { + let more = (&mut new_fragments).count(); + return Err(UserError::TooManyFragments(u8::MAX as usize + more + 1).into()); + } + }; + self.0.push(FragmentConfig { id, name }); + } + Ok(()) + } +} + +#[derive(Debug, Clone, Deserialize, Serialize)] +pub struct FragmentConfig { + pub id: u8, + pub name: String, +} + +pub struct IndexEmbeddingConfigs { + main: Database<Unspecified, Unspecified>, + embedder_info: Database<Str, EmbedderInfoCodec>, +} + +pub struct EmbedderInfo { + pub embedder_id: u8, + pub embedding_status: EmbeddingStatus, +} + +impl EmbedderInfo { + pub fn to_bytes(&self) -> Result<Cow<'_, [u8]>, heed::BoxedError> { + EmbedderInfoCodec::bytes_encode(self) + } +} + +/// Optimized struct to hold the lists of documents that are `user_provided` and `must_regenerate`. +/// +/// Because most documents have the same value for `user_provided` and `must_regenerate`, we store only +/// the `user_provided` bitmap and the list of documents whose `must_regenerate` value differs from the +/// one implied by `user_provided`. +#[derive(Default)] +pub struct EmbeddingStatus { + user_provided: RoaringBitmap, + skip_regenerate_different_from_user_provided: RoaringBitmap, +} + +impl EmbeddingStatus { + pub fn new() -> Self { + Default::default() + } + + /// Whether the document contains user-provided vectors for that embedder. + pub fn is_user_provided(&self, docid: DocumentId) -> bool { + self.user_provided.contains(docid) + } + /// Whether vectors should be regenerated for that document and that embedder.
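+    ///
+    /// Given the encoding above, this evaluates `!(user_provided ^ in_difference_bitmap)`:
+    /// a user-provided document absent from the difference bitmap is not regenerated,
+    /// while a user-provided document present in it is.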
+ pub fn must_regenerate(&self, docid: DocumentId) -> bool { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + !(user_provided ^ invert) + } + + pub fn is_user_provided_must_regenerate(&self, docid: DocumentId) -> (bool, bool) { + let invert = self.skip_regenerate_different_from_user_provided.contains(docid); + let user_provided = self.user_provided.contains(docid); + (user_provided, !(user_provided ^ invert)) + } + + pub fn user_provided_docids(&self) -> &RoaringBitmap { + &self.user_provided + } + + pub fn skip_regenerate_docids(&self) -> RoaringBitmap { + &self.user_provided ^ &self.skip_regenerate_different_from_user_provided + } + + pub(crate) fn into_user_provided(self) -> RoaringBitmap { + self.user_provided + } +} + +#[derive(Default)] +pub struct EmbeddingStatusDelta { + del_status: EmbeddingStatus, + add_status: EmbeddingStatus, +} + +impl EmbeddingStatusDelta { + pub fn new() -> Self { + Self::default() + } + + pub fn needs_change( + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) -> bool { + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + old_is_user_provided != new_is_user_provided + || old_skip_regenerate_different_user_provided + != new_skip_regenerate_different_user_provided + } + + pub fn needs_clear(is_user_provided: bool, must_regenerate: bool) -> bool { + Self::needs_change(is_user_provided, must_regenerate, false, true) + } + + pub fn clear_docid( + &mut self, + docid: DocumentId, + is_user_provided: bool, + must_regenerate: bool, + ) { + self.push_delta(docid, is_user_provided, must_regenerate, false, true); + } + + pub fn push_delta( + &mut self, + docid: DocumentId, + old_is_user_provided: bool, + old_must_regenerate: bool, + new_is_user_provided: bool, + new_must_regenerate: bool, + ) { + // must_regenerate == !skip_regenerate + let old_skip_regenerate_different_user_provided = + old_is_user_provided == old_must_regenerate; + let new_skip_regenerate_different_user_provided = + new_is_user_provided == new_must_regenerate; + + match (old_is_user_provided, new_is_user_provided) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.user_provided.insert(docid); + } + (false, true) => { + self.add_status.user_provided.insert(docid); + } + } + + match ( + old_skip_regenerate_different_user_provided, + new_skip_regenerate_different_user_provided, + ) { + (true, true) | (false, false) => { /* no change */ } + (true, false) => { + self.del_status.skip_regenerate_different_from_user_provided.insert(docid); + } + (false, true) => { + self.add_status.skip_regenerate_different_from_user_provided.insert(docid); + } + } + } + + pub fn push_new(&mut self, docid: DocumentId, is_user_provided: bool, must_regenerate: bool) { + self.push_delta( + docid, + !is_user_provided, + !must_regenerate, + is_user_provided, + must_regenerate, + ); + } + + pub fn apply_to(&self, status: &mut EmbeddingStatus) { + status.user_provided -= &self.del_status.user_provided; + status.user_provided |= &self.add_status.user_provided; + + status.skip_regenerate_different_from_user_provided -= + &self.del_status.skip_regenerate_different_from_user_provided; + status.skip_regenerate_different_from_user_provided |= + 
&self.add_status.skip_regenerate_different_from_user_provided; + } +} + +struct EmbedderInfoCodec; + +impl<'a> heed::BytesDecode<'a> for EmbedderInfoCodec { + type DItem = EmbedderInfo; + + fn bytes_decode(mut bytes: &'a [u8]) -> Result<Self::DItem, heed::BoxedError> { + let embedder_id = bytes.read_u8()?; + // Support all versions that didn't store the embedding status + if bytes.is_empty() { + return Ok(EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }); + } + let first_bitmap_size = bytes.read_u32::<BigEndian>()?; + let first_bitmap_bytes = &bytes[..first_bitmap_size as usize]; + let user_provided = CboRoaringBitmapCodec::bytes_decode(first_bitmap_bytes)?; + let skip_regenerate_different_from_user_provided = + CboRoaringBitmapCodec::bytes_decode(&bytes[first_bitmap_size as usize..])?; + Ok(EmbedderInfo { + embedder_id, + embedding_status: EmbeddingStatus { + user_provided, + skip_regenerate_different_from_user_provided, + }, + }) + } +} + +impl<'a> heed::BytesEncode<'a> for EmbedderInfoCodec { + type EItem = EmbedderInfo; + + fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, heed::BoxedError> { + let first_bitmap_size = + CboRoaringBitmapCodec::serialized_size(&item.embedding_status.user_provided); + let second_bitmap_size = CboRoaringBitmapCodec::serialized_size( + &item.embedding_status.skip_regenerate_different_from_user_provided, + ); + + let mut bytes = Vec::with_capacity(1 + 4 + first_bitmap_size + second_bitmap_size); + bytes.write_u8(item.embedder_id)?; + bytes.write_u32::<BigEndian>(first_bitmap_size.try_into()?)?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.user_provided, + &mut bytes, + )?; + CboRoaringBitmapCodec::serialize_into_writer( + &item.embedding_status.skip_regenerate_different_from_user_provided, + &mut bytes, + )?; + Ok(bytes.into()) + } +} + +impl IndexEmbeddingConfigs { + pub(crate) fn new( + main: Database<Unspecified, Unspecified>, + embedder_info: Database<Unspecified, Unspecified>, + ) -> Self { + Self { main, embedder_info: embedder_info.remap_types() } + } + + pub(crate) fn put_embedding_configs( + &self, + wtxn: &mut RwTxn<'_>, + configs: Vec<IndexEmbeddingConfig>, + ) -> heed::Result<()> { + self.main.remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>().put( + wtxn, + crate::index::main_key::EMBEDDING_CONFIGS, + &configs, + ) + } + + pub(crate) fn delete_embedding_configs(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<bool> { + self.main.remap_key_type::<Str>().delete(wtxn, crate::index::main_key::EMBEDDING_CONFIGS) + } + + pub fn embedding_configs(&self, rtxn: &RoTxn<'_>) -> heed::Result<Vec<IndexEmbeddingConfig>> { + Ok(self + .main + .remap_types::<Str, SerdeJson<Vec<IndexEmbeddingConfig>>>() + .get(rtxn, crate::index::main_key::EMBEDDING_CONFIGS)? + .unwrap_or_default()) + } + + pub fn embedder_id(&self, rtxn: &RoTxn<'_>, name: &str) -> heed::Result<Option<u8>> { + self.embedder_info.remap_data_type::<U8>().get(rtxn, name) + } + + pub fn put_fresh_embedder_id( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + embedder_id: u8, + ) -> heed::Result<()> { + let info = EmbedderInfo { embedder_id, embedding_status: EmbeddingStatus::new() }; + self.put_embedder_info(wtxn, name, &info) + } + + /// Iterate through the passed list of embedder names, associating a fresh embedder id to any new names. + /// + /// Passing the name of a currently existing embedder is not an error, and will not modify its embedder id, + /// so it is not necessary to differentiate between new and existing embedders before calling this function.
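+    ///
+    /// A sketch of the intended call pattern (embedder names hypothetical):
+    ///
+    /// ```ignore
+    /// index.embedding_configs().add_new_embedders(
+    ///     wtxn,
+    ///     ["default", "multimodal"],
+    ///     2, // total embedder count, reported in the `TooManyEmbedders` error
+    /// )?;
+    /// ```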
+ pub fn add_new_embedders<'a>( + &self, + wtxn: &mut RwTxn<'_>, + embedder_names: impl IntoIterator, + total_embedder_count: usize, + ) -> crate::Result<()> { + let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize]; + + for res in self.embedder_info.iter(wtxn)? { + let (_name, EmbedderInfo { embedder_id, embedding_status: _ }) = res?; + free_indices[embedder_id as usize] = false; + } + + let mut free_indices = free_indices.iter_mut().enumerate(); + let mut find_free_index = + move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8); + + for embedder_name in embedder_names { + if self.embedder_id(wtxn, embedder_name)?.is_some() { + continue; + } + let embedder_id = find_free_index() + .ok_or(crate::UserError::TooManyEmbedders(total_embedder_count))?; + tracing::debug!( + embedder = embedder_name, + embedder_id, + "assigning free id to new embedder" + ); + self.put_fresh_embedder_id(wtxn, embedder_name, embedder_id)?; + } + Ok(()) + } + + pub fn embedder_info( + &self, + rtxn: &RoTxn<'_>, + name: &str, + ) -> heed::Result> { + self.embedder_info.get(rtxn, name) + } + + /// Clear the list of docids that are `user_provided` or `must_regenerate` across all embedders. + pub fn clear_embedder_info_docids(&self, wtxn: &mut RwTxn<'_>) -> heed::Result<()> { + let mut it = self.embedder_info.iter_mut(wtxn)?; + while let Some(res) = it.next() { + let (embedder_name, info) = res?; + let embedder_name = embedder_name.to_owned(); + // SAFETY: we copied the `embedder_name` so are not using the reference while using put + unsafe { + it.put_current( + &embedder_name, + &EmbedderInfo { + embedder_id: info.embedder_id, + embedding_status: EmbeddingStatus::new(), + }, + )?; + } + } + Ok(()) + } + + pub fn iter_embedder_info<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.iter(rtxn) + } + + pub fn iter_embedder_id<'a>( + &self, + rtxn: &'a RoTxn<'_>, + ) -> heed::Result>> { + self.embedder_info.remap_data_type::().iter(rtxn) + } + + pub fn remove_embedder( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + ) -> heed::Result> { + let info = self.embedder_info.get(wtxn, name)?; + self.embedder_info.delete(wtxn, name)?; + Ok(info) + } + + pub fn put_embedder_info( + &self, + wtxn: &mut RwTxn<'_>, + name: &str, + info: &EmbedderInfo, + ) -> heed::Result<()> { + self.embedder_info.put(wtxn, name, info) + } +} diff --git a/crates/milli/src/vector/error.rs b/crates/milli/src/vector/error.rs index 685022de8..0d737cbfc 100644 --- a/crates/milli/src/vector/error.rs +++ b/crates/milli/src/vector/error.rs @@ -3,6 +3,7 @@ use std::path::PathBuf; use bumpalo::Bump; use hf_hub::api::sync::ApiError; +use itertools::Itertools as _; use super::parsed_vectors::ParsedVectorsDiff; use super::rest::ConfigurationSource; @@ -101,6 +102,32 @@ pub enum EmbedErrorKind { MissingEmbedding, #[error(transparent)] PanicInThreadPool(#[from] PanicCatched), + #[error("`media` requested but the configuration doesn't have source `rest`")] + RestMediaNotARest, + #[error("`media` requested, and the configuration has source `rest`, but the configuration doesn't have `searchFragments`.")] + RestMediaNotAFragment, + + #[error("Query matches multiple search fragments.\n - Note: First matched fragment `{name}`.\n - Note: Second matched fragment `{second_name}`.\n - Note: {}", + { + serde_json::json!({ + "q": q, + "media": media + }) + })] + RestSearchMatchesMultipleFragments { + name: String, + second_name: String, + q: Option, + media: Option, + }, + #[error("Query matches no search 
fragment.\n - Note: {}", + { + serde_json::json!({ + "q": q, + "media": media + }) + })] + RestSearchMatchesNoFragment { q: Option, media: Option }, } fn option_info(info: Option<&str>, prefix: &str) -> String { @@ -210,6 +237,44 @@ impl EmbedError { pub(crate) fn rest_extraction_error(error: String) -> EmbedError { Self { kind: EmbedErrorKind::RestExtractionError(error), fault: FaultSource::Runtime } } + + pub(crate) fn rest_media_not_a_rest() -> EmbedError { + Self { kind: EmbedErrorKind::RestMediaNotARest, fault: FaultSource::User } + } + + pub(crate) fn rest_media_not_a_fragment() -> EmbedError { + Self { kind: EmbedErrorKind::RestMediaNotAFragment, fault: FaultSource::User } + } + + pub(crate) fn rest_search_matches_multiple_fragments( + name: &str, + second_name: &str, + q: Option<&str>, + media: Option<&serde_json::Value>, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestSearchMatchesMultipleFragments { + name: name.to_string(), + second_name: second_name.to_string(), + q: q.map(String::from), + media: media.cloned(), + }, + fault: FaultSource::User, + } + } + + pub(crate) fn rest_search_matches_no_fragment( + q: Option<&str>, + media: Option<&serde_json::Value>, + ) -> EmbedError { + Self { + kind: EmbedErrorKind::RestSearchMatchesNoFragment { + q: q.map(String::from), + media: media.cloned(), + }, + fault: FaultSource::User, + } + } } #[derive(Debug, thiserror::Error)] @@ -382,6 +447,49 @@ impl NewEmbedderError { fault: FaultSource::User, } } + + pub(crate) fn rest_cannot_infer_dimensions_for_fragment() -> NewEmbedderError { + Self { + kind: NewEmbedderErrorKind::RestCannotInferDimensionsForFragment, + fault: FaultSource::User, + } + } + + pub(crate) fn rest_inconsistent_fragments( + indexing_fragments_is_empty: bool, + indexing_fragments: BTreeMap, + search_fragments: BTreeMap, + ) -> NewEmbedderError { + let message = if indexing_fragments_is_empty { + format!("`indexingFragments` is empty, but `searchFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `indexingFragments` or remove fragments from `searchFragments` by setting them to `null`", + search_fragments.len(), + search_fragments.keys().take(3).join(", "), if search_fragments.len() > 3 { ", ..." } else { "" } + ) + } else { + format!("`searchFragments` is empty, but `indexingFragments` declares {} fragments: {}{}\n - Hint: declare at least one fragment in `searchFragments` or remove fragments from `indexingFragments` by setting them to `null`", + indexing_fragments.len(), + indexing_fragments.keys().take(3).join(", "), if indexing_fragments.len() > 3 { ", ..." 
} else { "" } + ) + }; + + Self { + kind: NewEmbedderErrorKind::RestInconsistentFragments { message }, + fault: FaultSource::User, + } + } + + pub(crate) fn rest_document_template_and_fragments( + indexing_fragments_len: usize, + search_fragments_len: usize, + ) -> Self { + Self { + kind: NewEmbedderErrorKind::RestDocumentTemplateAndFragments { + indexing_fragments_len, + search_fragments_len, + }, + fault: FaultSource::User, + } + } } #[derive(Debug, Clone, Copy)] @@ -499,6 +607,12 @@ pub enum NewEmbedderErrorKind { CompositeEmbeddingCountMismatch { search_count: usize, index_count: usize }, #[error("error while generating test embeddings.\n - the embeddings produced at search time and indexing time are not similar enough.\n - angular distance {distance:.2}\n - Meilisearch requires a maximum distance of {MAX_COMPOSITE_DISTANCE}.\n - Note: check that both embedders produce similar embeddings.{hint}")] CompositeEmbeddingValueMismatch { distance: f32, hint: CompositeEmbedderContainsHuggingFace }, + #[error("cannot infer `dimensions` for an embedder using `indexingFragments`.\n - Note: Specify `dimensions` explicitly or don't use `indexingFragments`.")] + RestCannotInferDimensionsForFragment, + #[error("inconsistent fragments: {message}")] + RestInconsistentFragments { message: String }, + #[error("cannot pass both fragments and a document template.\n - Note: {indexing_fragments_len} fragments declared in `indexingFragments` and {search_fragments_len} fragments declared in `search_fragments_len`.\n - Hint: remove the declared fragments or remove the `documentTemplate`")] + RestDocumentTemplateAndFragments { indexing_fragments_len: usize, search_fragments_len: usize }, } pub struct PossibleEmbeddingMistakes { diff --git a/crates/milli/src/vector/extractor.rs b/crates/milli/src/vector/extractor.rs new file mode 100644 index 000000000..2ab541ac1 --- /dev/null +++ b/crates/milli/src/vector/extractor.rs @@ -0,0 +1,244 @@ +use std::cell::RefCell; +use std::collections::BTreeMap; +use std::fmt::Debug; + +use bumpalo::Bump; +use serde_json::Value; + +use super::json_template::{self, JsonTemplate}; +use crate::prompt::error::RenderPromptError; +use crate::prompt::Prompt; +use crate::update::new::document::Document; +use crate::vector::RuntimeFragment; +use crate::GlobalFieldsIdsMap; + +/// Trait for types that extract embedder inputs from a document. +/// +/// An embedder input can then be sent to an embedder by using an [`super::session::EmbedSession`]. +pub trait Extractor<'doc> { + /// The embedder input that is extracted from documents by this extractor. + /// + /// The inputs have to be comparable for equality so that diffing is possible. + type Input: PartialEq; + + /// The error that can happen while extracting from a document. + type Error; + + /// Metadata associated with a document. + type DocumentMetadata; + + /// Extract the embedder input from a document and its metadata. + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error>; + + /// Unique `id` associated with this extractor. + /// + /// This will serve to decide where to store the vectors in the vector store. + /// The id should be stable for a given extractor. + fn extractor_id(&self) -> u8; + + /// The result of diffing the embedder inputs extracted from two versions of a document. 
+ /// + /// # Parameters + /// + /// - `old`: old version of the document + /// - `new`: new version of the document + /// - `meta`: metadata associated to the document + fn diff_documents<'a, OD: Document<'a> + Debug, ND: Document<'a> + Debug>( + &self, + old: OD, + new: ND, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> + where + 'doc: 'a, + { + let old_input = self.extract(old, meta); + let new_input = self.extract(new, meta); + to_diff(old_input, new_input) + } + + /// The result of diffing the embedder inputs extracted from a document by two versions of this extractor. + /// + /// # Parameters + /// + /// - `doc`: the document from which to extract the embedder inputs + /// - `meta`: metadata associated to the document + /// - `old`: If `Some`, the old version of this extractor. If `None`, this is equivalent to calling `ExtractorDiff::Added(self.extract(_))`. + fn diff_settings<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + old: Option<&Self>, + ) -> Result, Self::Error> { + let old_input = if let Some(old) = old { old.extract(&doc, meta) } else { Ok(None) }; + let new_input = self.extract(&doc, meta); + + to_diff(old_input, new_input) + } + + /// Returns an extractor wrapping `self` and set to ignore all errors arising from extracting with this extractor. + fn ignore_errors(self) -> IgnoreErrorExtractor + where + Self: Sized, + { + IgnoreErrorExtractor(self) + } +} + +fn to_diff( + old_input: Result, E>, + new_input: Result, E>, +) -> Result, E> { + let old_input = old_input.ok().unwrap_or(None); + let new_input = new_input?; + Ok(match (old_input, new_input) { + (Some(old), Some(new)) if old == new => ExtractorDiff::Unchanged, + (None, None) => ExtractorDiff::Unchanged, + (None, Some(input)) => ExtractorDiff::Added(input), + (Some(_), None) => ExtractorDiff::Removed, + (Some(_), Some(input)) => ExtractorDiff::Updated(input), + }) +} + +pub enum ExtractorDiff { + Removed, + Added(Input), + Updated(Input), + Unchanged, +} + +impl ExtractorDiff { + pub fn into_input(self) -> Option { + match self { + ExtractorDiff::Removed => None, + ExtractorDiff::Added(input) => Some(input), + ExtractorDiff::Updated(input) => Some(input), + ExtractorDiff::Unchanged => None, + } + } + + pub fn needs_change(&self) -> bool { + match self { + ExtractorDiff::Removed => true, + ExtractorDiff::Added(_) => true, + ExtractorDiff::Updated(_) => true, + ExtractorDiff::Unchanged => false, + } + } + + pub fn into_list_of_changes( + named_diffs: impl IntoIterator, + ) -> BTreeMap> { + named_diffs + .into_iter() + .filter(|(_, diff)| diff.needs_change()) + .map(|(name, diff)| (name, diff.into_input())) + .collect() + } +} + +pub struct DocumentTemplateExtractor<'a, 'b, 'c> { + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + template: &'c Prompt, +} + +impl<'a, 'b, 'c> DocumentTemplateExtractor<'a, 'b, 'c> { + pub fn new( + template: &'c Prompt, + doc_alloc: &'a Bump, + field_id_map: &'a RefCell>, + ) -> Self { + Self { template, doc_alloc, field_id_map } + } +} + +impl<'doc> Extractor<'doc> for DocumentTemplateExtractor<'doc, '_, '_> { + type DocumentMetadata = &'doc str; + type Input = &'doc str; + type Error = RenderPromptError; + + fn extractor_id(&self) -> u8 { + 0 + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + external_docid: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.template.render_document( + external_docid, + doc, + self.field_id_map, + self.doc_alloc, + )?)) + } +} + +pub struct 
RequestFragmentExtractor<'a> { + fragment: &'a JsonTemplate, + extractor_id: u8, + doc_alloc: &'a Bump, +} + +impl<'a> RequestFragmentExtractor<'a> { + pub fn new(fragment: &'a RuntimeFragment, doc_alloc: &'a Bump) -> Self { + Self { fragment: &fragment.template, extractor_id: fragment.id, doc_alloc } + } +} + +impl<'doc> Extractor<'doc> for RequestFragmentExtractor<'doc> { + type DocumentMetadata = (); + type Input = Value; + type Error = json_template::Error; + + fn extractor_id(&self) -> u8 { + self.extractor_id + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + _meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(Some(self.fragment.render_document(doc, self.doc_alloc)?)) + } +} + +pub struct IgnoreErrorExtractor(E); + +impl<'doc, E> Extractor<'doc> for IgnoreErrorExtractor +where + E: Extractor<'doc>, +{ + type DocumentMetadata = E::DocumentMetadata; + type Input = E::Input; + + type Error = Infallible; + + fn extractor_id(&self) -> u8 { + self.0.extractor_id() + } + + fn extract<'a, D: Document<'a> + Debug>( + &self, + doc: D, + meta: &Self::DocumentMetadata, + ) -> Result, Self::Error> { + Ok(self.0.extract(doc, meta).ok().flatten()) + } +} + +#[derive(Debug)] +pub enum Infallible {} + +impl From for crate::Error { + fn from(_: Infallible) -> Self { + unreachable!("Infallible values cannot be built") + } +} diff --git a/crates/milli/src/vector/json_template.rs b/crates/milli/src/vector/json_template/injectable_value.rs similarity index 84% rename from crates/milli/src/vector/json_template.rs rename to crates/milli/src/vector/json_template/injectable_value.rs index 179cbe9af..ec7d900db 100644 --- a/crates/milli/src/vector/json_template.rs +++ b/crates/milli/src/vector/json_template/injectable_value.rs @@ -1,20 +1,17 @@ -//! Module to manipulate JSON templates. +//! Module to manipulate JSON values containing placeholder strings. //! //! This module allows two main operations: -//! 1. Render JSON values from a template and a context value. -//! 2. Retrieve data from a template and JSON values. - -#![warn(rustdoc::broken_intra_doc_links)] -#![warn(missing_docs)] +//! 1. Render JSON values from a template value containing placeholders and a value to inject. +//! 2. Extract data from a template value containing placeholders and a concrete JSON value that fits the template value. use serde::Deserialize; use serde_json::{Map, Value}; -type ValuePath = Vec; +use super::{format_value, inject_value, path_with_root, PathComponent, ValuePath}; /// Encapsulates a JSON template and allows injecting and extracting values from it. #[derive(Debug)] -pub struct ValueTemplate { +pub struct InjectableValue { template: Value, value_kind: ValueKind, } @@ -32,34 +29,13 @@ struct ArrayPath { value_path_in_array: ValuePath, } -/// Component of a path to a Value -#[derive(Debug, Clone)] -pub enum PathComponent { - /// A key inside of an object - MapKey(String), - /// An index inside of an array - ArrayIndex(usize), -} - -impl PartialEq for PathComponent { - fn eq(&self, other: &Self) -> bool { - match (self, other) { - (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, - (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, - _ => false, - } - } -} - -impl Eq for PathComponent {} - -/// Error that occurs when no few value was provided to a template for injection. +/// Error that occurs when no value was provided to a template for injection. 
#[derive(Debug)] pub struct MissingValue; -/// Error that occurs when trying to parse a template in [`ValueTemplate::new`] +/// Error that occurs when trying to parse a template in [`InjectableValue::new`] #[derive(Debug)] -pub enum TemplateParsingError { +pub enum InjectableParsingError { /// A repeat string appears inside a repeated value NestedRepeatString(ValuePath), /// A repeat string appears outside of an array @@ -85,42 +61,42 @@ pub enum TemplateParsingError { }, } -impl TemplateParsingError { +impl InjectableParsingError { /// Produce an error message from the error kind, the name of the root object, the placeholder string and the repeat string pub fn error_message(&self, root: &str, placeholder: &str, repeat: &str) -> String { match self { - TemplateParsingError::NestedRepeatString(path) => { + InjectableParsingError::NestedRepeatString(path) => { format!( r#"in {}: "{repeat}" appears nested inside of a value that is itself repeated"#, path_with_root(root, path) ) } - TemplateParsingError::RepeatStringNotInArray(path) => format!( + InjectableParsingError::RepeatStringNotInArray(path) => format!( r#"in {}: "{repeat}" appears outside of an array"#, path_with_root(root, path) ), - TemplateParsingError::BadIndexForRepeatString(path, index) => format!( + InjectableParsingError::BadIndexForRepeatString(path, index) => format!( r#"in {}: "{repeat}" expected at position #1, but found at position #{index}"#, path_with_root(root, path) ), - TemplateParsingError::MissingPlaceholderInRepeatedValue(path) => format!( + InjectableParsingError::MissingPlaceholderInRepeatedValue(path) => format!( r#"in {}: Expected "{placeholder}" inside of the repeated value"#, path_with_root(root, path) ), - TemplateParsingError::MultipleRepeatString(current, previous) => format!( + InjectableParsingError::MultipleRepeatString(current, previous) => format!( r#"in {}: Found "{repeat}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MultiplePlaceholderString(current, previous) => format!( + InjectableParsingError::MultiplePlaceholderString(current, previous) => format!( r#"in {}: Found "{placeholder}", but it was already present in {}"#, path_with_root(root, current), path_with_root(root, previous) ), - TemplateParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString => { format!(r#"in `{root}`: "{placeholder}" not found"#) } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, path_to_array, array_to_placeholder, @@ -140,41 +116,41 @@ impl TemplateParsingError { fn prepend_path(self, mut prepended_path: ValuePath) -> Self { match self { - TemplateParsingError::NestedRepeatString(mut path) => { + InjectableParsingError::NestedRepeatString(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::NestedRepeatString(prepended_path) + InjectableParsingError::NestedRepeatString(prepended_path) } - TemplateParsingError::RepeatStringNotInArray(mut path) => { + InjectableParsingError::RepeatStringNotInArray(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::RepeatStringNotInArray(prepended_path) + InjectableParsingError::RepeatStringNotInArray(prepended_path) } - TemplateParsingError::BadIndexForRepeatString(mut path, index) => { + InjectableParsingError::BadIndexForRepeatString(mut path, index) => { prepended_path.append(&mut path); - TemplateParsingError::BadIndexForRepeatString(prepended_path, index) + 
InjectableParsingError::BadIndexForRepeatString(prepended_path, index) } - TemplateParsingError::MissingPlaceholderInRepeatedValue(mut path) => { + InjectableParsingError::MissingPlaceholderInRepeatedValue(mut path) => { prepended_path.append(&mut path); - TemplateParsingError::MissingPlaceholderInRepeatedValue(prepended_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue(prepended_path) } - TemplateParsingError::MultipleRepeatString(mut path, older_path) => { + InjectableParsingError::MultipleRepeatString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultipleRepeatString(prepended_path, older_prepended_path) + InjectableParsingError::MultipleRepeatString(prepended_path, older_prepended_path) } - TemplateParsingError::MultiplePlaceholderString(mut path, older_path) => { + InjectableParsingError::MultiplePlaceholderString(mut path, older_path) => { let older_prepended_path = prepended_path.iter().cloned().chain(older_path).collect(); prepended_path.append(&mut path); - TemplateParsingError::MultiplePlaceholderString( + InjectableParsingError::MultiplePlaceholderString( prepended_path, older_prepended_path, ) } - TemplateParsingError::MissingPlaceholderString => { - TemplateParsingError::MissingPlaceholderString + InjectableParsingError::MissingPlaceholderString => { + InjectableParsingError::MissingPlaceholderString } - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path, mut path_to_array, array_to_placeholder, @@ -184,7 +160,7 @@ impl TemplateParsingError { prepended_path.iter().cloned().chain(single_path).collect(); prepended_path.append(&mut path_to_array); // we don't prepend the array_to_placeholder path as it is the array path that is prepended - TemplateParsingError::BothArrayAndSingle { + InjectableParsingError::BothArrayAndSingle { single_path: single_prepended_path, path_to_array: prepended_path, array_to_placeholder, @@ -194,7 +170,7 @@ impl TemplateParsingError { } } -/// Error that occurs when [`ValueTemplate::extract`] fails. +/// Error that occurs when [`InjectableValue::extract`] fails. #[derive(Debug)] pub struct ExtractionError { /// The cause of the failure @@ -336,27 +312,6 @@ enum LastNamedObject<'a> { NestedArrayInsideObject { object_name: &'a str, index: usize, nesting_level: usize }, } -/// Builds a string representation of a path, preprending the name of the root value. -pub fn path_with_root<'a>( - root: &str, - path: impl IntoIterator + 'a, -) -> String { - use std::fmt::Write as _; - let mut res = format!("`{root}"); - for component in path.into_iter() { - match component { - PathComponent::MapKey(key) => { - let _ = write!(&mut res, ".{key}"); - } - PathComponent::ArrayIndex(index) => { - let _ = write!(&mut res, "[{index}]"); - } - } - } - res.push('`'); - res -} - /// Context where an extraction failure happened /// /// The operation that failed @@ -405,7 +360,7 @@ enum ArrayParsingContext<'a> { NotNested(&'a mut Option), } -impl ValueTemplate { +impl InjectableValue { /// Prepare a template for injection or extraction. 
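+    ///
+    /// For instance, with the placeholder and repeat strings used by this module's tests:
+    ///
+    /// ```ignore
+    /// let template = InjectableValue::new(
+    ///     serde_json::json!({ "input": ["{{text}}", "{{..}}"] }),
+    ///     "{{text}}",
+    ///     "{{..}}",
+    /// )?;
+    /// ```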
/// /// # Parameters @@ -419,12 +374,12 @@ impl ValueTemplate { /// /// # Errors /// - /// - [`TemplateParsingError`]: refer to the documentation of this type + /// - [`InjectableParsingError`]: refer to the documentation of this type pub fn new( template: Value, placeholder_string: &str, repeat_string: &str, - ) -> Result { + ) -> Result { let mut value_path = None; let mut array_path = None; let mut current_path = Vec::new(); @@ -438,11 +393,11 @@ impl ValueTemplate { )?; let value_kind = match (array_path, value_path) { - (None, None) => return Err(TemplateParsingError::MissingPlaceholderString), + (None, None) => return Err(InjectableParsingError::MissingPlaceholderString), (None, Some(value_path)) => ValueKind::Single(value_path), (Some(array_path), None) => ValueKind::Array(array_path), (Some(array_path), Some(value_path)) => { - return Err(TemplateParsingError::BothArrayAndSingle { + return Err(InjectableParsingError::BothArrayAndSingle { single_path: value_path, path_to_array: array_path.path_to_array, array_to_placeholder: array_path.value_path_in_array, @@ -564,29 +519,29 @@ impl ValueTemplate { value_path: &mut Option, mut array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { // two modes for parsing array. match array { // 1. array contains a repeat string in second position [first, second, rest @ ..] if second == repeat_string => { let ArrayParsingContext::NotNested(array_path) = &mut array_path else { - return Err(TemplateParsingError::NestedRepeatString(current_path.clone())); + return Err(InjectableParsingError::NestedRepeatString(current_path.clone())); }; if let Some(array_path) = array_path { - return Err(TemplateParsingError::MultipleRepeatString( + return Err(InjectableParsingError::MultipleRepeatString( current_path.clone(), array_path.path_to_array.clone(), )); } if first == repeat_string { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), 0, )); } if let Some(position) = rest.iter().position(|value| value == repeat_string) { let position = position + 2; - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -609,7 +564,9 @@ impl ValueTemplate { value_path.ok_or_else(|| { let mut repeated_value_path = current_path.clone(); repeated_value_path.push(PathComponent::ArrayIndex(0)); - TemplateParsingError::MissingPlaceholderInRepeatedValue(repeated_value_path) + InjectableParsingError::MissingPlaceholderInRepeatedValue( + repeated_value_path, + ) })? }; **array_path = Some(ArrayPath { @@ -621,7 +578,7 @@ impl ValueTemplate { // 2. 
array does not contain a repeat string array => { if let Some(position) = array.iter().position(|value| value == repeat_string) { - return Err(TemplateParsingError::BadIndexForRepeatString( + return Err(InjectableParsingError::BadIndexForRepeatString( current_path.clone(), position, )); @@ -650,7 +607,7 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { for (key, value) in object.iter() { current_path.push(PathComponent::MapKey(key.to_owned())); Self::parse_value( @@ -673,12 +630,12 @@ impl ValueTemplate { value_path: &mut Option, array_path: &mut ArrayParsingContext, current_path: &mut ValuePath, - ) -> Result<(), TemplateParsingError> { + ) -> Result<(), InjectableParsingError> { match value { Value::String(str) => { if placeholder_string == str { if let Some(value_path) = value_path { - return Err(TemplateParsingError::MultiplePlaceholderString( + return Err(InjectableParsingError::MultiplePlaceholderString( current_path.clone(), value_path.clone(), )); @@ -687,7 +644,9 @@ impl ValueTemplate { *value_path = Some(current_path.clone()); } if repeat_string == str { - return Err(TemplateParsingError::RepeatStringNotInArray(current_path.clone())); + return Err(InjectableParsingError::RepeatStringNotInArray( + current_path.clone(), + )); } } Value::Null | Value::Bool(_) | Value::Number(_) => {} @@ -712,27 +671,6 @@ impl ValueTemplate { } } -fn inject_value(rendered: &mut Value, injection_path: &Vec, injected_value: Value) { - let mut current_value = rendered; - for injection_component in injection_path { - current_value = match injection_component { - PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), - PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), - } - } - *current_value = injected_value; -} - -fn format_value(value: &Value) -> String { - match value { - Value::Array(array) => format!("an array of size {}", array.len()), - Value::Object(object) => { - format!("an object with {} field(s)", object.len()) - } - value => value.to_string(), - } -} - fn extract_value( extraction_path: &[PathComponent], initial_value: &mut Value, @@ -838,10 +776,10 @@ impl ExtractionResultErrorContext for Result { mod test { use serde_json::{json, Value}; - use super::{PathComponent, TemplateParsingError, ValueTemplate}; + use super::{InjectableParsingError, InjectableValue, PathComponent}; - fn new_template(template: Value) -> Result { - ValueTemplate::new(template, "{{text}}", "{{..}}") + fn new_template(template: Value) -> Result { + InjectableValue::new(template, "{{text}}", "{{..}}") } #[test] @@ -853,7 +791,7 @@ mod test { }); let error = new_template(template.clone()).unwrap_err(); - assert!(matches!(error, TemplateParsingError::MissingPlaceholderString)) + assert!(matches!(error, InjectableParsingError::MissingPlaceholderString)) } #[test] @@ -887,7 +825,7 @@ mod test { }); match new_template(template.clone()) { - Err(TemplateParsingError::MultiplePlaceholderString(left, right)) => { + Err(InjectableParsingError::MultiplePlaceholderString(left, right)) => { assert_eq!( left, vec![PathComponent::MapKey("titi".into()), PathComponent::ArrayIndex(3)] diff --git a/crates/milli/src/vector/json_template/mod.rs b/crates/milli/src/vector/json_template/mod.rs new file mode 100644 index 000000000..d7ce3e8f1 --- /dev/null +++ b/crates/milli/src/vector/json_template/mod.rs @@ -0,0 +1,282 @@ +//! 
Exposes types to manipulate JSON values +//! +//! - [`JsonTemplate`]: renders JSON values by rendering its strings as [`Template`]s. +//! - [`InjectableValue`]: Describes a JSON value containing placeholders, +//! then allows to inject values instead of the placeholder to produce new concrete JSON values, +//! or extract sub-values at the placeholder location from concrete JSON values. +//! +//! The module also exposes foundational types to work with JSON paths: +//! +//! - [`ValuePath`] is made of [`PathComponent`]s to indicate the location of a sub-value inside of a JSON value. +//! - [`inject_value`] is a primitive that replaces the sub-value at the described location by an injected value. + +#![warn(rustdoc::broken_intra_doc_links)] +#![warn(missing_docs)] + +use bumpalo::Bump; +use liquid::{Parser, Template}; +use serde_json::{Map, Value}; + +use crate::prompt::ParseableDocument; +use crate::update::new::document::Document; + +mod injectable_value; + +pub use injectable_value::InjectableValue; + +/// Represents a JSON [`Value`] where each string is rendered as a [`Template`]. +#[derive(Debug)] +pub struct JsonTemplate { + value: Value, + templates: Vec, +} + +impl Clone for JsonTemplate { + fn clone(&self) -> Self { + Self::new(self.value.clone()).unwrap() + } +} + +struct TemplateAtPath { + template: Template, + path: ValuePath, +} + +impl std::fmt::Debug for TemplateAtPath { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TemplateAtPath") + .field("template", &&"template") + .field("path", &self.path) + .finish() + } +} + +/// Error that can occur either when parsing the templates in the value, or when trying to render them. +#[derive(Debug)] +pub struct Error { + template_error: liquid::Error, + path: ValuePath, +} + +impl Error { + /// Produces an error message when the error happened at rendering time. + pub fn rendering_error(&self, root: &str) -> String { + format!( + "in `{}`, error while rendering template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } + + /// Produces an error message when the error happened at parsing time. + pub fn parsing(&self, root: &str) -> String { + format!( + "in `{}`, error while parsing template: {}", + path_with_root(root, self.path.iter()), + &self.template_error + ) + } +} + +impl JsonTemplate { + /// Creates a new `JsonTemplate` by parsing all strings inside the value as templates. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be parsed. + pub fn new(value: Value) -> Result { + let templates = build_templates(&value)?; + Ok(Self { value, templates }) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the given context. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given context. + pub fn render(&self, context: &dyn liquid::ObjectView) -> Result { + let mut rendered = self.value.clone(); + for TemplateAtPath { template, path } in &self.templates { + let injected_value = + template.render(context).map_err(|err| error_with_path(err, path.clone()))?; + inject_value(&mut rendered, path, Value::String(injected_value)); + } + Ok(rendered) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the given document. 
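+    ///
+    /// For example, a (hypothetical) fragment value of `{"text": "{{ doc.title }}"}`
+    /// renders to `{"text": "Shrek"}` for a document whose `title` field is `"Shrek"`;
+    /// plain strings without Liquid syntax are rendered unchanged.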
+ /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered with the given document. + pub fn render_document<'a, 'doc, D: Document<'a> + std::fmt::Debug>( + &self, + document: D, + doc_alloc: &'doc Bump, + ) -> Result { + let document = ParseableDocument::new(document, doc_alloc); + let context = crate::prompt::Context::without_fields(&document); + self.render(&context) + } + + /// Renders this value by replacing all its strings with the rendered version of the template they represent from the contents of the search query. + /// + /// # Error + /// + /// - If any of the strings contains a template that cannot be rendered from the contents of the search query + pub fn render_search(&self, q: Option<&str>, media: Option<&Value>) -> Result { + let search_data = match (q, media) { + (None, None) => liquid::object!({}), + (None, Some(media)) => liquid::object!({ "media": media }), + (Some(q), None) => liquid::object!({"q": q}), + (Some(q), Some(media)) => liquid::object!({"q": q, "media": media}), + }; + self.render(&search_data) + } + + /// The JSON value representing the underlying template + pub fn template(&self) -> &Value { + &self.value + } +} + +fn build_templates(value: &Value) -> Result, Error> { + let mut current_path = ValuePath::new(); + let mut templates = Vec::new(); + let compiler = liquid::ParserBuilder::with_stdlib().build().unwrap(); + parse_value(value, &mut current_path, &mut templates, &compiler)?; + Ok(templates) +} + +fn error_with_path(template_error: liquid::Error, path: ValuePath) -> Error { + Error { template_error, path } +} + +fn parse_value( + value: &Value, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + match value { + Value::String(template) => { + let template = compiler + .parse(template) + .map_err(|err| error_with_path(err, current_path.clone()))?; + templates.push(TemplateAtPath { template, path: current_path.clone() }); + } + Value::Array(values) => { + parse_array(values, current_path, templates, compiler)?; + } + Value::Object(map) => { + parse_object(map, current_path, templates, compiler)?; + } + _ => {} + } + Ok(()) +} + +fn parse_object( + map: &Map, + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (key, value) in map { + current_path.push(PathComponent::MapKey(key.clone())); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +fn parse_array( + values: &[Value], + current_path: &mut ValuePath, + templates: &mut Vec, + compiler: &Parser, +) -> Result<(), Error> { + for (index, value) in values.iter().enumerate() { + current_path.push(PathComponent::ArrayIndex(index)); + parse_value(value, current_path, templates, compiler)?; + current_path.pop(); + } + Ok(()) +} + +/// A list of [`PathComponent`]s describing a path to a value inside a JSON value. +/// +/// The empty list refers to the root value. 
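+///
+/// For example, `vec![PathComponent::MapKey("input".into()), PathComponent::ArrayIndex(0)]`
+/// designates `value["input"][0]`, which [`path_with_root`] formats as `` `root.input[0]` ``.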
+pub type ValuePath = Vec<PathComponent>; + +/// Component of a path to a Value +#[derive(Debug, Clone)] +pub enum PathComponent { + /// A key inside of an object + MapKey(String), + /// An index inside of an array + ArrayIndex(usize), +} + +impl PartialEq for PathComponent { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + (Self::MapKey(l0), Self::MapKey(r0)) => l0 == r0, + (Self::ArrayIndex(l0), Self::ArrayIndex(r0)) => l0 == r0, + _ => false, + } + } +} + +impl Eq for PathComponent {} + +/// Builds a string representation of a path, prepending the name of the root value. +pub fn path_with_root<'a>( + root: &str, + path: impl IntoIterator<Item = &'a PathComponent> + 'a, +) -> String { + use std::fmt::Write as _; + let mut res = format!("`{root}"); + for component in path.into_iter() { + match component { + PathComponent::MapKey(key) => { + let _ = write!(&mut res, ".{key}"); + } + PathComponent::ArrayIndex(index) => { + let _ = write!(&mut res, "[{index}]"); + } + } + } + res.push('`'); + res +} + +/// Modifies `rendered` to replace the sub-value at the `injection_path` location by the `injected_value`. +/// +/// # Panics +/// +/// - if the provided `injection_path` cannot be traversed in `rendered`. +pub fn inject_value( + rendered: &mut Value, + injection_path: &Vec<PathComponent>, + injected_value: Value, +) { + let mut current_value = rendered; + for injection_component in injection_path { + current_value = match injection_component { + PathComponent::MapKey(key) => current_value.get_mut(key).unwrap(), + PathComponent::ArrayIndex(index) => current_value.get_mut(index).unwrap(), + } + } + *current_value = injected_value; +} + +fn format_value(value: &Value) -> String { + match value { + Value::Array(array) => format!("an array of size {}", array.len()), + Value::Object(object) => { + format!("an object with {} field(s)", object.len()) + } + value => value.to_string(), + } +} diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 065beb5fb..f64223e41 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -15,15 +15,20 @@ use utoipa::ToSchema; use self::error::{EmbedError, NewEmbedderError}; use crate::progress::{EmbedderStats, Progress}; use crate::prompt::{Prompt, PromptData}; +use crate::vector::composite::SubEmbedderOptions; +use crate::vector::json_template::JsonTemplate; use crate::ThreadPoolNoAbort; pub mod composite; +pub mod db; pub mod error; +pub mod extractor; pub mod hf; pub mod json_template; pub mod manual; pub mod openai; pub mod parsed_vectors; +pub mod session; pub mod settings; pub mod ollama; @@ -60,7 +65,7 @@ impl ArroyWrapper { rtxn: &'a RoTxn<'a>, db: arroy::Database<D>, ) -> impl Iterator<Item = Result<arroy::Reader<'a, D>, arroy::Error>> + 'a { - arroy_db_range_for_embedder(self.embedder_index).map_while(move |index| { + arroy_store_range_for_embedder(self.embedder_index).filter_map(move |index| { match arroy::Reader::open(rtxn, index, db) { Ok(reader) => match reader.is_empty(rtxn) { Ok(false) => Some(Ok(reader)), @@ -73,12 +78,57 @@ impl ArroyWrapper { }) } - pub fn dimensions(&self, rtxn: &RoTxn) -> Result<usize, arroy::Error> { - let first_id = arroy_db_range_for_embedder(self.embedder_index).next().unwrap(); + /// The item ids that are present in the store specified by its id. + /// + /// The ids are accessed via a lambda to avoid lifetime shenanigans.
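+    ///
+    /// A hypothetical use, counting the vectors stored for fragment store `0`:
+    ///
+    /// ```ignore
+    /// let count = wrapper.items_in_store(rtxn, 0, |items| items.len())?;
+    /// ```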
+ pub fn items_in_store<F, O>( + &self, + rtxn: &RoTxn, + store_id: u8, + with_items: F, + ) -> Result<O, arroy::Error> + where + F: FnOnce(&RoaringBitmap) -> O, + { if self.quantized { - Ok(arroy::Reader::open(rtxn, first_id, self.quantized_db())?.dimensions()) + self._items_in_store(rtxn, self.quantized_db(), store_id, with_items) } else { - Ok(arroy::Reader::open(rtxn, first_id, self.angular_db())?.dimensions()) + self._items_in_store(rtxn, self.angular_db(), store_id, with_items) + } + } + + fn _items_in_store<D: arroy::Distance, F, O>( + &self, + rtxn: &RoTxn, + db: arroy::Database<D>, + store_id: u8, + with_items: F, + ) -> Result<O, arroy::Error> + where + F: FnOnce(&RoaringBitmap) -> O, + { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let reader = arroy::Reader::open(rtxn, index, db); + match reader { + Ok(reader) => Ok(with_items(reader.item_ids())), + Err(arroy::Error::MissingMetadata(_)) => Ok(with_items(&RoaringBitmap::new())), + Err(err) => Err(err), + } + } + + pub fn dimensions(&self, rtxn: &RoTxn) -> Result<Option<usize>, arroy::Error> { + if self.quantized { + Ok(self + .readers(rtxn, self.quantized_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) + } else { + Ok(self + .readers(rtxn, self.angular_db()) + .next() + .transpose()? + .map(|reader| reader.dimensions())) } } @@ -93,13 +143,13 @@ impl ArroyWrapper { arroy_memory: Option<usize>, cancel: &(impl Fn() -> bool + Sync + Send), ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.need_build(wtxn)? { writer.builder(rng).build(wtxn)? } else if writer.is_empty(wtxn)? { - break; + continue; } } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); @@ -124,7 +174,7 @@ impl ArroyWrapper { .cancel(cancel) .build(wtxn)?; } else if writer.is_empty(wtxn)? { - break; + continue; } } } @@ -143,7 +193,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = embeddings.dimension(); for (index, vector) in - arroy_db_range_for_embedder(self.embedder_index).zip(embeddings.iter()) + arroy_store_range_for_embedder(self.embedder_index).zip(embeddings.iter()) { if self.quantized { arroy::Writer::new(self.quantized_db(), index, dimension) @@ -179,7 +229,7 @@ impl ArroyWrapper { ) -> Result<(), arroy::Error> { let dimension = vector.len(); - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); if !writer.contains_item(wtxn, item_id)? { writer.add_item(wtxn, item_id, vector)?; @@ -189,6 +239,38 @@ impl ArroyWrapper { Ok(()) } + /// Add a vector associated with a document in store specified by its id. + /// + /// Any existing vector associated with the document in the store will be replaced by the new vector.
+ pub fn add_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + if self.quantized { + self._add_item_in_store(wtxn, self.quantized_db(), item_id, store_id, vector) + } else { + self._add_item_in_store(wtxn, self.angular_db(), item_id, store_id, vector) + } + } + + fn _add_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + vector: &[f32], + ) -> Result<(), arroy::Error> { + let dimension = vector.len(); + + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimension); + writer.add_item(wtxn, item_id, vector) + } + /// Delete all embeddings from a specific `item_id` pub fn del_items( &self, @@ -196,24 +278,84 @@ impl ArroyWrapper { dimension: usize, item_id: arroy::ItemId, ) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); - if !writer.del_item(wtxn, item_id)? { - break; - } + writer.del_item(wtxn, item_id)?; } } Ok(()) } - /// Delete one item. + /// Removes the item specified by its id from the store specified by its id. + /// + /// Returns whether the item was removed. + /// + /// # Warning + /// + /// - This function will silently fail to remove the item if used against an arroy database that was never built. + pub fn del_item_in_store( + &self, + wtxn: &mut RwTxn, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + if self.quantized { + self._del_item_in_store(wtxn, self.quantized_db(), item_id, store_id, dimensions) + } else { + self._del_item_in_store(wtxn, self.angular_db(), item_id, store_id, dimensions) + } + } + + fn _del_item_in_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + item_id: arroy::ItemId, + store_id: u8, + dimensions: usize, + ) -> Result { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.del_item(wtxn, item_id) + } + + /// Removes all items from the store specified by its id. + /// + /// # Warning + /// + /// - This function will silently fail to remove the items if used against an arroy database that was never built. + pub fn clear_store( + &self, + wtxn: &mut RwTxn, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + if self.quantized { + self._clear_store(wtxn, self.quantized_db(), store_id, dimensions) + } else { + self._clear_store(wtxn, self.angular_db(), store_id, dimensions) + } + } + + fn _clear_store( + &self, + wtxn: &mut RwTxn, + db: arroy::Database, + store_id: u8, + dimensions: usize, + ) -> Result<(), arroy::Error> { + let index = arroy_store_for_embedder(self.embedder_index, store_id); + let writer = arroy::Writer::new(db, index, dimensions); + writer.clear(wtxn) + } + + /// Delete one item from its value. 
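A recurring change in this hunk and the surrounding ones is the replacement of `break` with `continue`, and of early-exit deletion loops with unconditional ones: the old code relied on the invariant that vectors were packed into the first arroy stores, so the first empty store meant "nothing after this". With one store per fragment, a document may legitimately have vectors in stores 0 and 2 but not 1, so every index must be visited. A toy model of why stopping at the first gap is no longer correct (illustrative only, plain slices instead of arroy writers):

```rust
fn visit_packed<'a>(stores: &[Option<&'a str>]) -> Vec<&'a str> {
    let mut out = Vec::new();
    for store in stores {
        match store {
            Some(vector) => out.push(*vector),
            // Old invariant: nothing can exist past the first empty store.
            None => break,
        }
    }
    out
}

fn visit_sparse<'a>(stores: &[Option<&'a str>]) -> Vec<&'a str> {
    // New behavior: inspect every store, skipping the empty ones.
    stores.iter().flatten().copied().collect()
}

fn main() {
    // Stores 0 and 2 hold vectors (fragments 0 and 2 rendered); fragment 1 did not.
    let stores = [Some("v0"), None, Some("v2")];
    assert_eq!(visit_packed(&stores), vec!["v0"]); // silently misses "v2"
    assert_eq!(visit_sparse(&stores), vec!["v0", "v2"]);
}
```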
pub fn del_item( &self, wtxn: &mut RwTxn, @@ -235,54 +377,31 @@ impl ArroyWrapper { vector: &[f32], ) -> Result { let dimension = vector.len(); - let mut deleted_index = None; - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let writer = arroy::Writer::new(db, index, dimension); let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - // uses invariant: vectors are packed in the first writers. - break; + continue; }; if candidate == vector { - writer.del_item(wtxn, item_id)?; - deleted_index = Some(index); + return writer.del_item(wtxn, item_id); } } - - // 🥲 enforce invariant: vectors are packed in the first writers. - if let Some(deleted_index) = deleted_index { - let mut last_index_with_a_vector = None; - for index in - arroy_db_range_for_embedder(self.embedder_index).skip(deleted_index as usize) - { - let writer = arroy::Writer::new(db, index, dimension); - let Some(candidate) = writer.item_vector(wtxn, item_id)? else { - break; - }; - last_index_with_a_vector = Some((index, candidate)); - } - if let Some((last_index, vector)) = last_index_with_a_vector { - let writer = arroy::Writer::new(db, last_index, dimension); - writer.del_item(wtxn, item_id)?; - let writer = arroy::Writer::new(db, deleted_index, dimension); - writer.add_item(wtxn, item_id, &vector)?; - } - } - Ok(deleted_index.is_some()) + Ok(false) } pub fn clear(&self, wtxn: &mut RwTxn, dimension: usize) -> Result<(), arroy::Error> { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(wtxn)? { - break; + continue; } writer.clear(wtxn)?; } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(wtxn)? { - break; + continue; } writer.clear(wtxn)?; } @@ -296,17 +415,17 @@ impl ArroyWrapper { dimension: usize, item: arroy::ItemId, ) -> Result { - for index in arroy_db_range_for_embedder(self.embedder_index) { + for index in arroy_store_range_for_embedder(self.embedder_index) { let contains = if self.quantized { let writer = arroy::Writer::new(self.quantized_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? } else { let writer = arroy::Writer::new(self.angular_db(), index, dimension); if writer.is_empty(rtxn)? { - break; + continue; } writer.contains_item(rtxn, item)? }; @@ -345,13 +464,14 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } if let Some(mut ret) = searcher.by_item(rtxn, item)? { results.append(&mut ret); - } else { - break; } } results.sort_unstable_by_key(|(_, distance)| OrderedFloat(*distance)); @@ -386,6 +506,9 @@ impl ArroyWrapper { let reader = reader?; let mut searcher = reader.nns(limit); if let Some(filter) = filter { + if reader.item_ids().is_disjoint(filter) { + continue; + } searcher.candidates(filter); } @@ -404,16 +527,12 @@ impl ArroyWrapper { for reader in self.readers(rtxn, self.quantized_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? { vectors.push(vec); - } else { - break; } } } else { for reader in self.readers(rtxn, self.angular_db()) { if let Some(vec) = reader?.item_vector(rtxn, item_id)? 
{ vectors.push(vec); - } else { - break; } } } @@ -465,6 +584,7 @@ pub struct ArroyStats { pub documents: RoaringBitmap, } /// One or multiple embeddings stored consecutively in a flat vector. +#[derive(Debug, PartialEq)] pub struct Embeddings<F> { data: Vec<F>, dimension: usize, @@ -615,15 +735,43 @@ impl EmbeddingConfig { } } -/// Map of embedder configurations. -/// -/// Each configuration is mapped to a name. +/// Map of runtime embedder data. #[derive(Clone, Default)] -pub struct EmbeddingConfigs(HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>); +pub struct RuntimeEmbedders(HashMap<String, Arc<RuntimeEmbedder>>); -impl EmbeddingConfigs { +pub struct RuntimeEmbedder { + pub embedder: Arc<Embedder>, + pub document_template: Prompt, + fragments: Vec<RuntimeFragment>, + pub is_quantized: bool, +} + +impl RuntimeEmbedder { + pub fn new( + embedder: Arc<Embedder>, + document_template: Prompt, + mut fragments: Vec<RuntimeFragment>, + is_quantized: bool, + ) -> Self { + fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name)); + Self { embedder, document_template, fragments, is_quantized } + } + + /// The runtime fragments sorted by name. + pub fn fragments(&self) -> &[RuntimeFragment] { + self.fragments.as_slice() + } +} + +pub struct RuntimeFragment { + pub name: String, + pub id: u8, + pub template: JsonTemplate, +} + +impl RuntimeEmbedders { /// Create the map from its internal components. - pub fn new(data: HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)>) -> Self { + pub fn new(data: HashMap<String, Arc<RuntimeEmbedder>>) -> Self { Self(data) } @@ -632,24 +780,31 @@ impl EmbeddingConfigs { } /// Get an embedder configuration and template from its name. - pub fn get(&self, name: &str) -> Option<(Arc<Embedder>, Arc<Prompt>, bool)> { - self.0.get(name).cloned() + pub fn get(&self, name: &str) -> Option<&Arc<RuntimeEmbedder>> { + self.0.get(name) } - pub fn inner_as_ref(&self) -> &HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> { + pub fn inner_as_ref(&self) -> &HashMap<String, Arc<RuntimeEmbedder>> { &self.0 } - pub fn into_inner(self) -> HashMap<String, (Arc<Embedder>, Arc<Prompt>, bool)> { + pub fn into_inner(self) -> HashMap<String, Arc<RuntimeEmbedder>> { self.0 } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } } -impl IntoIterator for EmbeddingConfigs { - type Item = (String, (Arc<Embedder>, Arc<Prompt>, bool)); +impl IntoIterator for RuntimeEmbedders { + type Item = (String, Arc<RuntimeEmbedder>); - type IntoIter = - std::collections::hash_map::IntoIter<String, (Arc<Embedder>, Arc<Prompt>, bool)>; + type IntoIter = std::collections::hash_map::IntoIter<String, Arc<RuntimeEmbedder>>; fn into_iter(self) -> Self::IntoIter { self.0.into_iter() @@ -667,6 +822,27 @@ pub enum EmbedderOptions { Composite(composite::EmbedderOptions), } +impl EmbedderOptions { + pub fn fragment(&self, name: &str) -> Option<&serde_json::Value> { + match &self { + EmbedderOptions::HuggingFace(_) + | EmbedderOptions::OpenAi(_) + | EmbedderOptions::Ollama(_) + | EmbedderOptions::UserProvided(_) => None, + EmbedderOptions::Rest(embedder_options) => { + embedder_options.indexing_fragments.get(name) + } + EmbedderOptions::Composite(embedder_options) => { + if let SubEmbedderOptions::Rest(embedder_options) = &embedder_options.index { + embedder_options.indexing_fragments.get(name) + } else { + None + } + } + } + } +} + impl Default for EmbedderOptions { fn default() -> Self { Self::HuggingFace(Default::default()) @@ -707,6 +883,17 @@ impl Embedder { #[tracing::instrument(level = "debug", skip_all, target = "search")] pub fn embed_search( + &self, + query: SearchQuery<'_>, + deadline: Option<Instant>, + ) -> std::result::Result<Embedding, EmbedError> { + match query { + SearchQuery::Text(text) => self.embed_search_text(text, deadline), + SearchQuery::Media { q, media } => self.embed_search_media(q, media, deadline), + } + } + + pub fn embed_search_text( + &self, + text: &str, + deadline:
Option<Instant>, @@ -728,10 +915,7 @@ impl Embedder { .pop() .ok_or_else(EmbedError::missing_embedding), Embedder::UserProvided(embedder) => embedder.embed_one(text), - Embedder::Rest(embedder) => embedder - .embed_ref(&[text], deadline, None)? - .pop() - .ok_or_else(EmbedError::missing_embedding), + Embedder::Rest(embedder) => embedder.embed_one(SearchQuery::Text(text), deadline, None), Embedder::Composite(embedder) => embedder.search.embed_one(text, deadline, None), }?; @@ -742,6 +926,18 @@ impl Embedder { Ok(embedding) } + pub fn embed_search_media( + &self, + q: Option<&str>, + media: Option<&serde_json::Value>, + deadline: Option<Instant>, + ) -> std::result::Result<Embedding, EmbedError> { + let Embedder::Rest(embedder) = self else { + return Err(EmbedError::rest_media_not_a_rest()); + }; + embedder.embed_one(SearchQuery::Media { q, media }, deadline, None) + } + /// Embed multiple chunks of texts. /// /// Each chunk is composed of one or multiple texts. @@ -786,6 +982,26 @@ impl Embedder { } } + pub fn embed_index_ref_fragments( + &self, + fragments: &[serde_json::Value], + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result<Vec<Embedding>, EmbedError> { + if let Embedder::Rest(embedder) = self { + embedder.embed_index_ref(fragments, threads, embedder_stats) + } else { + let Embedder::Composite(embedder) = self else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + let crate::vector::composite::SubEmbedder::Rest(embedder) = &embedder.index else { + unimplemented!("embedding fragments is only available for rest embedders") + }; + + embedder.embed_index_ref(fragments, threads, embedder_stats) + } + } + /// Indicates the preferred number of chunks to pass to [`Self::embed_chunks`] pub fn chunk_count_hint(&self) -> usize { match self { @@ -857,6 +1073,12 @@ impl Embedder { } } +#[derive(Clone, Copy)] +pub enum SearchQuery<'a> { + Text(&'a str), + Media { q: Option<&'a str>, media: Option<&'a serde_json::Value> }, +} + /// Describes the mean and sigma of distribution of embedding similarity in the embedding space. /// /// The intended use is to make the similarity score more comparable to the regular ranking score.
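The hunk that follows replaces `arroy_db_range_for_embedder` with the pair `arroy_store_range_for_embedder`/`arroy_store_for_embedder`: the 16-bit arroy index now packs the embedder id into the high byte and a store id (one store per fragment) into the low byte, so each embedder owns a contiguous, non-overlapping block of 256 possible stores. A small sketch of that layout, duplicating the formula from the hunk below:

```rust
// High byte = embedder id, low byte = store id.
fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 {
    ((embedder_id as u16) << 8) | (store_id as u16)
}

fn main() {
    assert_eq!(arroy_store_for_embedder(0, 0), 0x0000);
    assert_eq!(arroy_store_for_embedder(1, 0), 0x0100); // embedder 1, store 0
    assert_eq!(arroy_store_for_embedder(1, 255), 0x01FF); // last store of embedder 1
    // One embedder's range can never collide with the next embedder's range.
    assert!(arroy_store_for_embedder(1, 255) < arroy_store_for_embedder(2, 0));
}
```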
@@ -986,8 +1208,11 @@ pub const fn is_cuda_enabled() -> bool { cfg!(feature = "cuda") } -pub fn arroy_db_range_for_embedder(embedder_id: u8) -> impl Iterator { - let embedder_id = (embedder_id as u16) << 8; - - (0..=u8::MAX).map(move |k| embedder_id | (k as u16)) +fn arroy_store_range_for_embedder(embedder_id: u8) -> impl Iterator { + (0..=u8::MAX).map(move |store_id| arroy_store_for_embedder(embedder_id, store_id)) +} + +fn arroy_store_for_embedder(embedder_id: u8, store_id: u8) -> u16 { + let embedder_id = (embedder_id as u16) << 8; + embedder_id | (store_id as u16) } diff --git a/crates/milli/src/vector/ollama.rs b/crates/milli/src/vector/ollama.rs index d4329a2de..feec92cc0 100644 --- a/crates/milli/src/vector/ollama.rs +++ b/crates/milli/src/vector/ollama.rs @@ -71,6 +71,8 @@ impl EmbedderOptions { request, response, headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }) } } diff --git a/crates/milli/src/vector/openai.rs b/crates/milli/src/vector/openai.rs index 0159d5c76..bf6c92978 100644 --- a/crates/milli/src/vector/openai.rs +++ b/crates/milli/src/vector/openai.rs @@ -201,6 +201,8 @@ impl Embedder { ] }), headers: Default::default(), + indexing_fragments: Default::default(), + search_fragments: Default::default(), }, cache_cap, super::rest::ConfigurationSource::OpenAi, diff --git a/crates/milli/src/vector/parsed_vectors.rs b/crates/milli/src/vector/parsed_vectors.rs index 5fcb2912b..b96922bc4 100644 --- a/crates/milli/src/vector/parsed_vectors.rs +++ b/crates/milli/src/vector/parsed_vectors.rs @@ -6,9 +6,8 @@ use serde_json::value::RawValue; use serde_json::{from_slice, Value}; use super::Embedding; -use crate::index::IndexEmbeddingConfig; use crate::update::del_add::{DelAdd, KvReaderDelAdd}; -use crate::{DocumentId, FieldId, InternalError, UserError}; +use crate::{FieldId, InternalError, UserError}; #[derive(serde::Serialize, Debug)] #[serde(untagged)] @@ -151,7 +150,8 @@ impl<'doc> serde::de::Visitor<'doc> for RawVectorsVisitor { regenerate = Some(value); } Ok(Some("embeddings")) => { - let value: &RawValue = match map.next_value() { + let value: &RawValue = match map.next_value::<&RawValue>() { + Ok(value) if value.get() == RawValue::NULL.get() => continue, Ok(value) => value, Err(error) => { return Ok(Err(RawVectorsError::DeserializeEmbeddings { @@ -374,8 +374,7 @@ pub struct ParsedVectorsDiff { impl ParsedVectorsDiff { pub fn new( - docid: DocumentId, - embedders_configs: &[IndexEmbeddingConfig], + regenerate_for_embedders: impl Iterator, documents_diff: &KvReader, old_vectors_fid: Option, new_vectors_fid: Option, @@ -396,10 +395,8 @@ impl ParsedVectorsDiff { } } .flatten().map_or(BTreeMap::default(), |del| del.into_iter().map(|(name, vec)| (name, VectorState::Inline(vec))).collect()); - for embedding_config in embedders_configs { - if embedding_config.user_provided.contains(docid) { - old.entry(embedding_config.name.to_string()).or_insert(VectorState::Manual); - } + for name in regenerate_for_embedders { + old.entry(name).or_insert(VectorState::Generated); } let new = 'new: { diff --git a/crates/milli/src/vector/rest.rs b/crates/milli/src/vector/rest.rs index fbe3c1129..7a16f1a1e 100644 --- a/crates/milli/src/vector/rest.rs +++ b/crates/milli/src/vector/rest.rs @@ -6,11 +6,13 @@ use rand::Rng; use rayon::iter::{IntoParallelIterator as _, ParallelIterator as _}; use rayon::slice::ParallelSlice as _; use serde::{Deserialize, Serialize}; +use serde_json::Value; use super::error::EmbedErrorKind; -use 
super::json_template::ValueTemplate; +use super::json_template::{InjectableValue, JsonTemplate}; use super::{ - DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, REQUEST_PARALLELISM, + DistributionShift, EmbedError, Embedding, EmbeddingCache, NewEmbedderError, SearchQuery, + REQUEST_PARALLELISM, }; use crate::error::FaultSource; use crate::progress::EmbedderStats; @@ -88,19 +90,61 @@ struct EmbedderData { bearer: Option, headers: BTreeMap, url: String, - request: Request, + request: RequestData, response: Response, configuration_source: ConfigurationSource, } +#[derive(Debug)] +pub enum RequestData { + Single(Request), + FromFragments(RequestFromFragments), +} + +impl RequestData { + pub fn new( + request: Value, + indexing_fragments: BTreeMap, + search_fragments: BTreeMap, + ) -> Result { + Ok(if indexing_fragments.is_empty() && search_fragments.is_empty() { + RequestData::Single(Request::new(request)?) + } else { + for (name, value) in indexing_fragments { + JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".indexingFragments.{name}")), + ) + })?; + } + RequestData::FromFragments(RequestFromFragments::new(request, search_fragments)?) + }) + } + + fn input_type(&self) -> InputType { + match self { + RequestData::Single(request) => request.input_type(), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.input_type() + } + } + } + + fn has_fragments(&self) -> bool { + matches!(self, RequestData::FromFragments(_)) + } +} + #[derive(Debug, Clone, PartialEq, Eq, Deserialize, Serialize)] pub struct EmbedderOptions { pub api_key: Option, pub distribution: Option, pub dimensions: Option, pub url: String, - pub request: serde_json::Value, - pub response: serde_json::Value, + pub request: Value, + pub search_fragments: BTreeMap, + pub indexing_fragments: BTreeMap, + pub response: Value, pub headers: BTreeMap, } @@ -138,7 +182,12 @@ impl Embedder { .timeout(std::time::Duration::from_secs(30)) .build(); - let request = Request::new(options.request)?; + let request = RequestData::new( + options.request, + options.indexing_fragments, + options.search_fragments, + )?; + let response = Response::new(options.response, &request)?; let data = EmbedderData { @@ -188,7 +237,7 @@ impl Embedder { embedder_stats: Option<&EmbedderStats>, ) -> Result, EmbedError> where - S: AsRef + Serialize, + S: Serialize, { embed(&self.data, texts, texts.len(), Some(self.dimensions), deadline, embedder_stats) } @@ -231,9 +280,9 @@ impl Embedder { } } - pub(crate) fn embed_index_ref( + pub(crate) fn embed_index_ref( &self, - texts: &[&str], + texts: &[S], threads: &ThreadPoolNoAbort, embedder_stats: &EmbedderStats, ) -> Result, EmbedError> { @@ -287,9 +336,44 @@ impl Embedder { pub(super) fn cache(&self) -> &EmbeddingCache { &self.cache } + + pub(crate) fn embed_one( + &self, + query: SearchQuery, + deadline: Option, + embedder_stats: Option<&EmbedderStats>, + ) -> Result { + let mut embeddings = match (&self.data.request, query) { + (RequestData::Single(_), SearchQuery::Text(text)) => { + embed(&self.data, &[text], 1, Some(self.dimensions), deadline, embedder_stats) + } + (RequestData::Single(_), SearchQuery::Media { q: _, media: _ }) => { + return Err(EmbedError::rest_media_not_a_fragment()) + } + (RequestData::FromFragments(request_from_fragments), SearchQuery::Text(q)) => { + let fragment = request_from_fragments.render_search_fragment(Some(q), None)?; + + embed(&self.data, &[fragment], 1, 
Some(self.dimensions), deadline, embedder_stats) + } + ( + RequestData::FromFragments(request_from_fragments), + SearchQuery::Media { q, media }, + ) => { + let fragment = request_from_fragments.render_search_fragment(q, media)?; + + embed(&self.data, &[fragment], 1, Some(self.dimensions), deadline, embedder_stats) + } + }?; + + // unwrap: checked by `expected_count` + Ok(embeddings.pop().unwrap()) + } } fn infer_dimensions(data: &EmbedderData) -> Result { + if data.request.has_fragments() { + return Err(NewEmbedderError::rest_cannot_infer_dimensions_for_fragment()); + } let v = embed(data, ["test"].as_slice(), 1, None, None, None) .map_err(NewEmbedderError::could_not_determine_dimension)?; // unwrap: guaranteed that v.len() == 1, otherwise the previous line terminated in error @@ -307,6 +391,13 @@ fn embed( where S: Serialize, { + if inputs.is_empty() { + if expected_count != 0 { + return Err(EmbedError::rest_response_embedding_count(expected_count, 0)); + } + return Ok(Vec::new()); + } + let request = data.client.post(&data.url); let request = if let Some(bearer) = &data.bearer { request.set("Authorization", bearer) @@ -318,7 +409,12 @@ where request = request.set(header.as_str(), value.as_str()); } - let body = data.request.inject_texts(inputs); + let body = match &data.request { + RequestData::Single(request) => request.inject_texts(inputs), + RequestData::FromFragments(request_from_fragments) => { + request_from_fragments.request_from_fragments(inputs).expect("inputs was empty") + } + }; for attempt in 0..10 { if let Some(embedder_stats) = &embedder_stats { @@ -426,7 +522,7 @@ fn response_to_embedding( expected_count: usize, expected_dimensions: Option, ) -> Result, Retry> { - let response: serde_json::Value = response + let response: Value = response .into_json() .map_err(EmbedError::rest_response_deserialization) .map_err(Retry::retry_later)?; @@ -455,21 +551,24 @@ fn response_to_embedding( } pub(super) const REQUEST_PLACEHOLDER: &str = "{{text}}"; +pub(super) const REQUEST_FRAGMENT_PLACEHOLDER: &str = "{{fragment}}"; pub(super) const RESPONSE_PLACEHOLDER: &str = "{{embedding}}"; pub(super) const REPEAT_PLACEHOLDER: &str = "{{..}}"; #[derive(Debug)] pub struct Request { - template: ValueTemplate, + template: InjectableValue, } impl Request { - pub fn new(template: serde_json::Value) -> Result { - let template = match ValueTemplate::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) { + pub fn new(template: Value) -> Result { + let template = match InjectableValue::new(template, REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER) + { Ok(template) => template, Err(error) => { let message = error.error_message("request", REQUEST_PLACEHOLDER, REPEAT_PLACEHOLDER); + let message = format!("{message}\n - Note: this template is using a document template, and so expects to contain the placeholder {REQUEST_PLACEHOLDER:?} rather than {REQUEST_FRAGMENT_PLACEHOLDER:?}"); return Err(NewEmbedderError::rest_could_not_parse_template(message)); } }; @@ -485,42 +584,120 @@ impl Request { } } - pub fn inject_texts( - &self, - texts: impl IntoIterator, - ) -> serde_json::Value { + pub fn inject_texts(&self, texts: impl IntoIterator) -> Value { self.template.inject(texts.into_iter().map(|s| serde_json::json!(s))).unwrap() } } #[derive(Debug)] -pub struct Response { - template: ValueTemplate, +pub struct RequestFromFragments { + search_fragments: BTreeMap, + request: InjectableValue, } -impl Response { - pub fn new(template: serde_json::Value, request: &Request) -> Result { - let template = match 
ValueTemplate::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) - { +impl RequestFromFragments { + pub fn new( + request: Value, + search_fragments: impl IntoIterator<Item = (String, Value)>, + ) -> Result<Self, NewEmbedderError> { + let request = match InjectableValue::new( + request, + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ) { Ok(template) => template, Err(error) => { - let message = - error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); + let message = error.error_message( + "request", + REQUEST_FRAGMENT_PLACEHOLDER, + REPEAT_PLACEHOLDER, + ); + let message = format!("{message}\n - Note: this template is using fragments, and so expects to contain the placeholder {REQUEST_FRAGMENT_PLACEHOLDER:?} rather than {REQUEST_PLACEHOLDER:?}"); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); } }; - match (template.has_array_value(), request.template.has_array_value()) { + let search_fragments: Result<_, NewEmbedderError> = search_fragments + .into_iter() + .map(|(name, value)| { + let json_template = JsonTemplate::new(value).map_err(|error| { + NewEmbedderError::rest_could_not_parse_template( + error.parsing(&format!(".searchFragments.{name}")), + ) + })?; + Ok((name, json_template)) + }) + .collect(); + + Ok(Self { request, search_fragments: search_fragments? }) + } + + fn input_type(&self) -> InputType { + if self.request.has_array_value() { + InputType::TextArray + } else { + InputType::Text + } + } + + pub fn render_search_fragment( + &self, + q: Option<&str>, + media: Option<&Value>, + ) -> Result<Value, EmbedError> { + let mut it = self.search_fragments.iter().filter_map(|(name, template)| { + let render = template.render_search(q, media).ok()?; + Some((name, render)) + }); + let Some((name, fragment)) = it.next() else { + return Err(EmbedError::rest_search_matches_no_fragment(q, media)); + }; + if let Some((second_name, _)) = it.next() { + return Err(EmbedError::rest_search_matches_multiple_fragments( + name, + second_name, + q, + media, + )); + } + + Ok(fragment) + } + + pub fn request_from_fragments<'a, S: Serialize + 'a>( + &self, + fragments: impl IntoIterator<Item = &'a S>, + ) -> Option<Value> { + self.request.inject(fragments.into_iter().map(|fragment| serde_json::json!(fragment))).ok() + } +} + +#[derive(Debug)] +pub struct Response { + template: InjectableValue, +} + +impl Response { + pub fn new(template: Value, request: &RequestData) -> Result<Self, NewEmbedderError> { + let template = + match InjectableValue::new(template, RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER) { + Ok(template) => template, + Err(error) => { + let message = + error.error_message("response", RESPONSE_PLACEHOLDER, REPEAT_PLACEHOLDER); + return Err(NewEmbedderError::rest_could_not_parse_template(message)); + } + }; + + match (template.has_array_value(), request.input_type() == InputType::TextArray) { (true, true) | (false, false) => Ok(Self {template}), (true, false) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has multiple embeddings, but `request` has only one text to embed".to_string())), (false, true) => Err(NewEmbedderError::rest_could_not_parse_template("in `response`: `response` has a single embedding, but `request` has multiple texts to embed".to_string())), } } - pub fn extract_embeddings( - &self, - response: serde_json::Value, - ) -> Result<Vec<Embedding>, EmbedError> { + pub fn extract_embeddings(&self, response: Value) -> Result<Vec<Embedding>, EmbedError> { let extracted_values: Vec<Value> = match self.template.extract(response) { Ok(extracted_values) => extracted_values, Err(error) => { diff --git a/crates/milli/src/vector/session.rs
b/crates/milli/src/vector/session.rs new file mode 100644 index 000000000..b582bd840 --- /dev/null +++ b/crates/milli/src/vector/session.rs @@ -0,0 +1,177 @@ +use bumpalo::collections::Vec as BVec; +use bumpalo::Bump; +use serde_json::Value; + +use super::{EmbedError, Embedder, Embedding}; +use crate::progress::EmbedderStats; +use crate::{DocumentId, Result, ThreadPoolNoAbort}; +type ExtractorId = u8; + +#[derive(Clone, Copy)] +pub struct Metadata<'doc> { + pub docid: DocumentId, + pub external_docid: &'doc str, + pub extractor_id: ExtractorId, +} + +pub struct EmbeddingResponse<'doc> { + pub metadata: Metadata<'doc>, + pub embedding: Option, +} + +pub trait OnEmbed<'doc> { + type ErrorMetadata; + + fn process_embedding_response(&mut self, response: EmbeddingResponse<'doc>); + fn process_embedding_error( + &mut self, + error: EmbedError, + embedder_name: &'doc str, + unused_vectors_distribution: &Self::ErrorMetadata, + metadata: BVec<'doc, Metadata<'doc>>, + ) -> crate::Error; +} + +pub struct EmbedSession<'doc, C, I> { + // requests + inputs: BVec<'doc, I>, + metadata: BVec<'doc, Metadata<'doc>>, + + threads: &'doc ThreadPoolNoAbort, + embedder: &'doc Embedder, + + embedder_name: &'doc str, + + embedder_stats: &'doc EmbedderStats, + + on_embed: C, +} + +pub trait Input: Sized { + fn embed_ref( + inputs: &[Self], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError>; +} + +impl Input for &'_ str { + fn embed_ref( + inputs: &[Self], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError> { + embedder.embed_index_ref(inputs, threads, embedder_stats) + } +} + +impl Input for Value { + fn embed_ref( + inputs: &[Value], + embedder: &Embedder, + threads: &ThreadPoolNoAbort, + embedder_stats: &EmbedderStats, + ) -> std::result::Result, EmbedError> { + embedder.embed_index_ref_fragments(inputs, threads, embedder_stats) + } +} + +impl<'doc, C: OnEmbed<'doc>, I: Input> EmbedSession<'doc, C, I> { + #[allow(clippy::too_many_arguments)] + pub fn new( + embedder: &'doc Embedder, + embedder_name: &'doc str, + threads: &'doc ThreadPoolNoAbort, + doc_alloc: &'doc Bump, + embedder_stats: &'doc EmbedderStats, + on_embed: C, + ) -> Self { + let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); + let texts = BVec::with_capacity_in(capacity, doc_alloc); + let ids = BVec::with_capacity_in(capacity, doc_alloc); + Self { + inputs: texts, + metadata: ids, + embedder, + threads, + embedder_name, + embedder_stats, + on_embed, + } + } + + pub fn request_embedding( + &mut self, + metadata: Metadata<'doc>, + rendered: I, + unused_vectors_distribution: &C::ErrorMetadata, + ) -> Result<()> { + if self.inputs.len() < self.inputs.capacity() { + self.inputs.push(rendered); + self.metadata.push(metadata); + return Ok(()); + } + + self.embed_chunks(unused_vectors_distribution) + } + + pub fn drain(mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result { + self.embed_chunks(unused_vectors_distribution)?; + Ok(self.on_embed) + } + + #[allow(clippy::too_many_arguments)] + fn embed_chunks(&mut self, unused_vectors_distribution: &C::ErrorMetadata) -> Result<()> { + if self.inputs.is_empty() { + return Ok(()); + } + let res = match I::embed_ref( + self.inputs.as_slice(), + self.embedder, + self.threads, + self.embedder_stats, + ) { + Ok(embeddings) => { + for (metadata, embedding) in self.metadata.iter().copied().zip(embeddings) { + 
self.on_embed.process_embedding_response(EmbeddingResponse { + metadata, + embedding: Some(embedding), + }); + } + Ok(()) + } + Err(error) => { + // reset metadata and inputs, and send metadata to the error processing. + let doc_alloc = self.metadata.bump(); + let metadata = std::mem::replace( + &mut self.metadata, + BVec::with_capacity_in(self.inputs.capacity(), doc_alloc), + ); + self.inputs.clear(); + return Err(self.on_embed.process_embedding_error( + error, + self.embedder_name, + unused_vectors_distribution, + metadata, + )); + } + }; + self.inputs.clear(); + self.metadata.clear(); + res + } + + pub(crate) fn embedder_name(&self) -> &'doc str { + self.embedder_name + } + + pub(crate) fn doc_alloc(&self) -> &'doc Bump { + self.inputs.bump() + } + + pub(crate) fn on_embed_mut(&mut self) -> &mut C { + &mut self.on_embed + } +} diff --git a/crates/milli/src/vector/settings.rs b/crates/milli/src/vector/settings.rs index 712c1faa5..1b85dd503 100644 --- a/crates/milli/src/vector/settings.rs +++ b/crates/milli/src/vector/settings.rs @@ -2,6 +2,8 @@ use std::collections::BTreeMap; use std::num::NonZeroUsize; use deserr::Deserr; +use either::Either; +use itertools::Itertools; use roaring::RoaringBitmap; use serde::{Deserialize, Serialize}; use utoipa::ToSchema; @@ -229,6 +231,35 @@ pub struct EmbeddingSettings { /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting, + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub indexing_fragments: Setting>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub search_fragments: Setting>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option)] @@ -483,6 +514,36 @@ pub struct SubEmbeddingSettings { /// - 🌱 When modified for source `openAi`, embeddings are never regenerated /// - 🏗️ When modified for sources `ollama` and `rest`, embeddings are always regenerated pub url: Setting, + + /// Template fragments that will be reassembled and sent to the remote embedder at indexing time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🏗️ When a fragment is deleted by passing `null` to its name, the corresponding embeddings are removed from documents. + /// - 🏗️ When a fragment is modified, the corresponding embeddings are regenerated if their rendered version changes. 
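Before the `indexingFragments` setting field that the doc comment above introduces, a quick aside on the `EmbedSession` type added in session.rs earlier in this diff: it buffers rendered inputs up to a capacity derived from the embedder's chunk hints, flushes them to the embedder as one batch, and `drain` flushes whatever remains. A toy model of that buffer-then-flush flow (illustrative only; it ignores metadata, error handling, and the bump-allocated vectors the real type uses, and it re-buffers the triggering input after a flush):

```rust
struct EmbedBuffer {
    capacity: usize,
    inputs: Vec<String>,
    batches_sent: Vec<Vec<String>>, // stand-in for calls to the embedder
}

impl EmbedBuffer {
    fn request_embedding(&mut self, rendered: String) {
        if self.inputs.len() < self.capacity {
            self.inputs.push(rendered);
            return;
        }
        // Buffer full: flush it as one batch, then start a new one.
        self.flush();
        self.inputs.push(rendered);
    }

    fn drain(mut self) -> Vec<Vec<String>> {
        self.flush();
        self.batches_sent
    }

    fn flush(&mut self) {
        if !self.inputs.is_empty() {
            self.batches_sent.push(std::mem::take(&mut self.inputs));
        }
    }
}

fn main() {
    let mut session = EmbedBuffer { capacity: 2, inputs: Vec::new(), batches_sent: Vec::new() };
    for rendered in ["a", "b", "c"] {
        session.request_embedding(rendered.to_string());
    }
    // Two inputs filled the first batch; the third triggered a flush.
    let batches = session.drain();
    assert_eq!(batches, vec![vec!["a".to_string(), "b".to_string()], vec!["c".to_string()]]);
}
```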
+ #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub indexing_fragments: Setting>>, + + /// Template fragments that will be reassembled and sent to the remote embedder at search time. + /// + /// # Availability + /// + /// - This parameter is available for sources `rest`. + /// + /// # 🔄 Reindexing + /// + /// - 🌱 Changing the value of this parameter never regenerates embeddings + #[serde(default, skip_serializing_if = "Setting::is_not_set")] + #[deserr(default)] + #[schema(value_type = Option>)] + pub search_fragments: Setting>>, + #[serde(default, skip_serializing_if = "Setting::is_not_set")] #[deserr(default)] #[schema(value_type = Option)] @@ -554,17 +615,31 @@ pub struct SubEmbeddingSettings { pub indexing_embedder: Setting, } +#[derive(Debug, Copy, Clone, PartialEq, Eq)] +pub enum EmbeddingValidationContext { + FullSettings, + SettingsPartialUpdate, +} + /// Indicates what action should take place during a reindexing operation for an embedder -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash)] pub enum ReindexAction { /// An indexing operation should take place for this embedder, keeping existing vectors /// and checking whether the document template changed or not RegeneratePrompts, + RegenerateFragments(Vec<(String, RegenerateFragment)>), /// An indexing operation should take place for all documents for this embedder, removing existing vectors /// (except userProvided ones) FullReindex, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] +pub enum RegenerateFragment { + Update, + Remove, + Add, +} + pub enum SettingsDiff { Remove, Reindex { action: ReindexAction, updated_settings: EmbeddingSettings, quantize: bool }, @@ -577,6 +652,12 @@ pub struct EmbedderAction { pub is_being_quantized: bool, pub write_back: Option, pub reindex: Option, + pub remove_fragments: Option, +} + +#[derive(Debug)] +pub struct RemoveFragments { + pub fragment_ids: Vec, } impl EmbedderAction { @@ -592,6 +673,10 @@ impl EmbedderAction { self.reindex.as_ref() } + pub fn remove_fragments(&self) -> Option<&RemoveFragments> { + self.remove_fragments.as_ref() + } + pub fn with_is_being_quantized(mut self, quantize: bool) -> Self { self.is_being_quantized = quantize; self @@ -603,11 +688,23 @@ impl EmbedderAction { is_being_quantized: false, write_back: Some(write_back), reindex: None, + remove_fragments: None, } } pub fn with_reindex(reindex: ReindexAction, was_quantized: bool) -> Self { - Self { was_quantized, is_being_quantized: false, write_back: None, reindex: Some(reindex) } + Self { + was_quantized, + is_being_quantized: false, + write_back: None, + reindex: Some(reindex), + remove_fragments: None, + } + } + + pub fn with_remove_fragments(mut self, remove_fragments: RemoveFragments) -> Self { + self.remove_fragments = Some(remove_fragments); + self } } @@ -634,6 +731,8 @@ impl SettingsDiff { mut dimensions, mut document_template, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut search_embedder, @@ -653,6 +752,8 @@ impl SettingsDiff { dimensions: new_dimensions, document_template: new_document_template, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, search_embedder: new_search_embedder, @@ -684,6 +785,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut 
request, &mut response, &mut headers, @@ -696,6 +799,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -722,6 +827,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, search_embedder, @@ -769,6 +876,8 @@ impl SettingsDiff { mut document_template, mut document_template_max_bytes, mut url, + mut indexing_fragments, + mut search_fragments, mut request, mut response, mut headers, @@ -794,6 +903,8 @@ impl SettingsDiff { document_template: new_document_template, document_template_max_bytes: new_document_template_max_bytes, url: new_url, + indexing_fragments: new_indexing_fragments, + search_fragments: new_search_fragments, request: new_request, response: new_response, headers: new_headers, @@ -814,6 +925,8 @@ impl SettingsDiff { &mut document_template, &mut document_template_max_bytes, &mut url, + &mut indexing_fragments, + &mut search_fragments, &mut request, &mut response, &mut headers, @@ -826,6 +939,8 @@ impl SettingsDiff { new_document_template, new_document_template_max_bytes, new_url, + new_indexing_fragments, + new_search_fragments, new_request, new_response, new_headers, @@ -846,6 +961,8 @@ impl SettingsDiff { dimensions, document_template, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -875,6 +992,8 @@ impl SettingsDiff { document_template: &mut Setting, document_template_max_bytes: &mut Setting, url: &mut Setting, + indexing_fragments: &mut Setting>>, + search_fragments: &mut Setting>>, request: &mut Setting, response: &mut Setting, headers: &mut Setting>, @@ -887,6 +1006,8 @@ impl SettingsDiff { new_document_template: Setting, new_document_template_max_bytes: Setting, new_url: Setting, + new_indexing_fragments: Setting>>, + new_search_fragments: Setting>>, new_request: Setting, new_response: Setting, new_headers: Setting>, @@ -902,6 +1023,8 @@ impl SettingsDiff { pooling, dimensions, url, + indexing_fragments, + search_fragments, request, response, document_template, @@ -941,6 +1064,105 @@ impl SettingsDiff { } } } + + *search_fragments = match (std::mem::take(search_fragments), new_search_fragments) { + (Setting::Set(search_fragments), Setting::Set(new_search_fragments)) => { + Setting::Set( + search_fragments + .into_iter() + .merge_join_by(new_search_fragments, |(left, _), (right, _)| { + left.cmp(right) + }) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both((name, _), (_, right)) => { + (name, right) + } + // unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right(right) => right, + } + }) + .collect(), + ) + } + (_, Setting::Reset) => Setting::Reset, + (left, Setting::NotSet) => left, + (Setting::NotSet | Setting::Reset, Setting::Set(new_search_fragments)) => { + Setting::Set(new_search_fragments) + } + }; + + let mut regenerate_fragments = Vec::new(); + *indexing_fragments = match (std::mem::take(indexing_fragments), new_indexing_fragments) { + (Setting::Set(fragments), Setting::Set(new_fragments)) => { + Setting::Set( + fragments + .into_iter() + .merge_join_by(new_fragments, |(left, _), (right, _)| left.cmp(right)) + .map(|eob| { + match eob { + // merge fragments + itertools::EitherOrBoth::Both( + (name, left), + (other_name, right), + ) => { + if left == right { + (name, left) + } else { + match right { + Some(right) => { + regenerate_fragments + 
.push((other_name, RegenerateFragment::Update)); + (name, Some(right)) + } + None => { + regenerate_fragments + .push((other_name, RegenerateFragment::Remove)); + (name, None) + } + } + } + } + // unchanged fragment + itertools::EitherOrBoth::Left(left) => left, + // new fragment + itertools::EitherOrBoth::Right((name, right)) => { + if right.is_some() { + regenerate_fragments + .push((name.clone(), RegenerateFragment::Add)); + } + (name, right) + } + } + }) + .collect(), + ) + } + // remove all fragments => move to document template + (_, Setting::Reset) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + Setting::Reset + } + // add all fragments + (Setting::NotSet | Setting::Reset, Setting::Set(new_fragments)) => { + ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); + + Setting::Set(new_fragments) + } + // no change + (left, Setting::NotSet) => left, + }; + if !regenerate_fragments.is_empty() { + regenerate_fragments.sort_unstable_by(|(left, _), (right, _)| left.cmp(right)); + ReindexAction::push_action( + reindex_action, + ReindexAction::RegenerateFragments(regenerate_fragments), + ); + } + if request.apply(new_request) { ReindexAction::push_action(reindex_action, ReindexAction::FullReindex); } @@ -972,10 +1194,16 @@ impl SettingsDiff { impl ReindexAction { fn push_action(this: &mut Option, other: Self) { - *this = match (*this, other) { - (_, ReindexAction::FullReindex) => Some(ReindexAction::FullReindex), - (Some(ReindexAction::FullReindex), _) => Some(ReindexAction::FullReindex), - (_, ReindexAction::RegeneratePrompts) => Some(ReindexAction::RegeneratePrompts), + use ReindexAction::*; + *this = match (this.take(), other) { + (_, FullReindex) => Some(FullReindex), + (Some(FullReindex), _) => Some(FullReindex), + (_, RegenerateFragments(fragments)) => Some(RegenerateFragments(fragments)), + (Some(RegenerateFragments(fragments)), RegeneratePrompts) => { + Some(RegenerateFragments(fragments)) + } + (Some(RegeneratePrompts), RegeneratePrompts) => Some(RegeneratePrompts), + (None, RegeneratePrompts) => Some(RegeneratePrompts), } } } @@ -988,6 +1216,8 @@ fn apply_default_for_source( pooling: &mut Setting, dimensions: &mut Setting, url: &mut Setting, + indexing_fragments: &mut Setting>>, + search_fragments: &mut Setting>>, request: &mut Setting, response: &mut Setting, document_template: &mut Setting, @@ -1003,6 +1233,8 @@ fn apply_default_for_source( *pooling = Setting::Reset; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1015,6 +1247,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1027,6 +1261,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::Reset; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *headers = Setting::NotSet; @@ -1039,6 +1275,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::Reset; + *indexing_fragments = Setting::Reset; + *search_fragments = Setting::Reset; *request = Setting::Reset; *response = Setting::Reset; 
*headers = Setting::Reset; @@ -1051,6 +1289,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::Reset; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1065,6 +1305,8 @@ fn apply_default_for_source( *pooling = Setting::NotSet; *dimensions = Setting::NotSet; *url = Setting::NotSet; + *indexing_fragments = Setting::NotSet; + *search_fragments = Setting::NotSet; *request = Setting::NotSet; *response = Setting::NotSet; *document_template = Setting::NotSet; @@ -1131,6 +1373,8 @@ pub enum MetaEmbeddingSetting { DocumentTemplate, DocumentTemplateMaxBytes, Url, + IndexingFragments, + SearchFragments, Request, Response, Headers, @@ -1153,6 +1397,8 @@ impl MetaEmbeddingSetting { DocumentTemplate => "documentTemplate", DocumentTemplateMaxBytes => "documentTemplateMaxBytes", Url => "url", + IndexingFragments => "indexingFragments", + SearchFragments => "searchFragments", Request => "request", Response => "response", Headers => "headers", @@ -1176,6 +1422,8 @@ impl EmbeddingSettings { dimensions: &Setting, api_key: &Setting, url: &Setting, + indexing_fragments: &Setting>>, + search_fragments: &Setting>>, request: &Setting, response: &Setting, document_template: &Setting, @@ -1210,6 +1458,20 @@ impl EmbeddingSettings { )?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::ApiKey, context, api_key)?; Self::check_setting(embedder_name, source, MetaEmbeddingSetting::Url, context, url)?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::IndexingFragments, + context, + indexing_fragments, + )?; + Self::check_setting( + embedder_name, + source, + MetaEmbeddingSetting::SearchFragments, + context, + search_fragments, + )?; Self::check_setting( embedder_name, source, @@ -1348,8 +1610,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( OpenAi, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, ( @@ -1359,8 +1621,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( HuggingFace, - ApiKey | Dimensions | Url | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + ApiKey | Dimensions | Url | IndexingFragments | SearchFragments | Request + | Response | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (Ollama, Model, _) => FieldStatus::Mandatory, @@ -1371,8 +1633,8 @@ impl EmbeddingSettings { ) => FieldStatus::Allowed, ( Ollama, - Revision | Pooling | Request | Response | Headers | SearchEmbedder - | IndexingEmbedder, + Revision | Pooling | IndexingFragments | SearchFragments | Request | Response + | Headers | SearchEmbedder | IndexingEmbedder, _, ) => FieldStatus::Disallowed, (UserProvided, Dimensions, _) => FieldStatus::Mandatory, @@ -1386,6 +1648,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers @@ -1404,6 +1668,10 @@ impl EmbeddingSettings { | Headers, _, ) => FieldStatus::Allowed, + (Rest, IndexingFragments, NotNested | Indexing) => FieldStatus::Allowed, + (Rest, IndexingFragments, Search) => FieldStatus::Disallowed, + (Rest, SearchFragments, NotNested | Search) => FieldStatus::Allowed, + (Rest, SearchFragments, Indexing) => 
FieldStatus::Disallowed, (Rest, Model | Revision | Pooling | SearchEmbedder | IndexingEmbedder, _) => { FieldStatus::Disallowed } @@ -1419,6 +1687,8 @@ impl EmbeddingSettings { | DocumentTemplate | DocumentTemplateMaxBytes | Url + | IndexingFragments + | SearchFragments | Request | Response | Headers, @@ -1512,6 +1782,11 @@ impl std::fmt::Display for EmbedderSource { } } +#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq, Deserr, ToSchema)] +pub struct Fragment { + pub value: serde_json::Value, +} + impl EmbeddingSettings { fn from_hugging_face( super::hf::EmbedderOptions { @@ -1534,6 +1809,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1566,6 +1843,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1598,6 +1877,8 @@ impl EmbeddingSettings { document_template, document_template_max_bytes, url: Setting::some_or_not_set(url), + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1622,6 +1903,8 @@ impl EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1638,6 +1921,8 @@ impl EmbeddingSettings { dimensions, url, request, + indexing_fragments, + search_fragments, response, distribution, headers, @@ -1653,9 +1938,39 @@ impl EmbeddingSettings { pooling: Setting::NotSet, api_key: Setting::some_or_not_set(api_key), dimensions: Setting::some_or_not_set(dimensions), - document_template, - document_template_max_bytes, + document_template: if indexing_fragments.is_empty() && search_fragments.is_empty() { + document_template + } else { + Setting::NotSet + }, + document_template_max_bytes: if indexing_fragments.is_empty() + && search_fragments.is_empty() + { + document_template_max_bytes + } else { + Setting::NotSet + }, url: Setting::Set(url), + indexing_fragments: if indexing_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + indexing_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, + search_fragments: if search_fragments.is_empty() { + Setting::NotSet + } else { + Setting::Set( + search_fragments + .into_iter() + .map(|(name, fragment)| (name, Some(Fragment { value: fragment }))) + .collect(), + ) + }, request: Setting::Set(request), response: Setting::Set(response), distribution: Setting::some_or_not_set(distribution), @@ -1714,6 +2029,8 @@ impl From for EmbeddingSettings { document_template: Setting::NotSet, document_template_max_bytes: Setting::NotSet, url: Setting::NotSet, + indexing_fragments: Setting::NotSet, + search_fragments: Setting::NotSet, request: Setting::NotSet, response: Setting::NotSet, headers: Setting::NotSet, @@ -1786,6 +2103,8 @@ impl From for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1804,6 +2123,8 @@ impl From 
for SubEmbeddingSettings { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1828,6 +2149,8 @@ impl From for EmbeddingConfig { document_template, document_template_max_bytes, url, + indexing_fragments, + search_fragments, request, response, distribution, @@ -1879,6 +2202,8 @@ impl From for EmbeddingConfig { EmbedderSource::Rest => SubEmbedderOptions::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -1922,6 +2247,8 @@ impl SubEmbedderOptions { document_template: _, document_template_max_bytes: _, url, + indexing_fragments, + search_fragments, request, response, headers, @@ -1944,6 +2271,8 @@ impl SubEmbedderOptions { EmbedderSource::Rest => Self::rest( url.set().unwrap(), api_key, + indexing_fragments, + search_fragments, request.set().unwrap(), response.set().unwrap(), headers, @@ -2010,9 +2339,13 @@ impl SubEmbedderOptions { distribution: distribution.set(), }) } + + #[allow(clippy::too_many_arguments)] fn rest( url: String, api_key: Setting, + indexing_fragments: Setting>>, + search_fragments: Setting>>, request: serde_json::Value, response: serde_json::Value, headers: Setting>, @@ -2027,6 +2360,22 @@ impl SubEmbedderOptions { response, distribution: distribution.set(), headers: headers.set().unwrap_or_default(), + search_fragments: search_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), + indexing_fragments: indexing_fragments + .set() + .unwrap_or_default() + .into_iter() + .filter_map(|(name, fragment)| { + Some((name, fragment.map(|fragment| fragment.value)?)) + }) + .collect(), }) } fn ollama( @@ -2066,3 +2415,29 @@ impl From for EmbedderOptions { } } } + +pub(crate) fn fragments_from_settings( + setting: &Setting, +) -> impl Iterator + '_ { + let Some(setting) = setting.as_ref().set() else { return Either::Left(None.into_iter()) }; + + let filter_map = |(name, fragment): (&String, &Option)| { + if fragment.is_some() { + Some(name.clone()) + } else { + None + } + }; + + if let Some(setting) = setting.indexing_fragments.as_ref().set() { + Either::Right(setting.iter().filter_map(filter_map)) + } else { + let Some(setting) = setting.indexing_embedder.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + let Some(setting) = setting.indexing_fragments.as_ref().set() else { + return Either::Left(None.into_iter()); + }; + Either::Right(setting.iter().filter_map(filter_map)) + } +} diff --git a/crates/milli/tests/search/facet_distribution.rs b/crates/milli/tests/search/facet_distribution.rs index d04db425e..cc1b85369 100644 --- a/crates/milli/tests/search/facet_distribution.rs +++ b/crates/milli/tests/search/facet_distribution.rs @@ -5,7 +5,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{FacetDistribution, FilterableAttributesRule, Index, Object, OrderBy}; use serde_json::{from_value, json}; @@ -35,7 +35,7 @@ fn test_facet_distribution_with_no_facet_values() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = 
indexer::DocumentOperation::new(); let doc1: Object = from_value( diff --git a/crates/milli/tests/search/mod.rs b/crates/milli/tests/search/mod.rs index 3ee78561d..fa03f1cc1 100644 --- a/crates/milli/tests/search/mod.rs +++ b/crates/milli/tests/search/mod.rs @@ -10,7 +10,7 @@ use maplit::{btreemap, hashset}; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{ AscDesc, Criterion, DocumentId, FilterableAttributesRule, Index, Member, TermsMatchingStrategy, }; @@ -74,7 +74,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/query_criteria.rs b/crates/milli/tests/search/query_criteria.rs index cb0c23e42..3f8134085 100644 --- a/crates/milli/tests/search/query_criteria.rs +++ b/crates/milli/tests/search/query_criteria.rs @@ -8,7 +8,7 @@ use maplit::hashset; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult, TermsMatchingStrategy}; use rand::Rng; use Criterion::*; @@ -288,7 +288,7 @@ fn criteria_ascdesc() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); let mut file = tempfile::tempfile().unwrap(); diff --git a/crates/milli/tests/search/typo_tolerance.rs b/crates/milli/tests/search/typo_tolerance.rs index 49c9c7b5d..95ff85165 100644 --- a/crates/milli/tests/search/typo_tolerance.rs +++ b/crates/milli/tests/search/typo_tolerance.rs @@ -6,7 +6,7 @@ use milli::documents::mmap_from_objects; use milli::progress::Progress; use milli::update::new::indexer; use milli::update::{IndexerConfig, Settings}; -use milli::vector::EmbeddingConfigs; +use milli::vector::RuntimeEmbedders; use milli::{Criterion, Index, Object, Search, TermsMatchingStrategy}; use serde_json::from_value; use tempfile::tempdir; @@ -123,7 +123,7 @@ fn test_typo_disabled_on_word() { let db_fields_ids_map = index.fields_ids_map(&rtxn).unwrap(); let mut new_fields_ids_map = db_fields_ids_map.clone(); - let embedders = EmbeddingConfigs::default(); + let embedders = RuntimeEmbedders::default(); let mut indexer = indexer::DocumentOperation::new(); indexer.replace_documents(&documents).unwrap();
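The test updates above only swap `EmbeddingConfigs::default()` for `RuntimeEmbedders::default()`, but the non-default construction path is worth illustrating: `RuntimeEmbedder::new` (from the mod.rs hunk earlier) sorts fragments by name at construction so that `fragments()` can promise a name-sorted slice, and the fragment `id` presumably doubles as the arroy store id. A toy model of that invariant, using local stand-ins for milli's types rather than the real `Embedder`, `Prompt`, and `JsonTemplate`:

```rust
use std::collections::HashMap;
use std::sync::Arc;

struct RuntimeFragment {
    name: String,
    id: u8, // store id assigned to this fragment
}

struct RuntimeEmbedder {
    fragments: Vec<RuntimeFragment>,
}

impl RuntimeEmbedder {
    fn new(mut fragments: Vec<RuntimeFragment>) -> Self {
        // Same invariant as in mod.rs: fragments are sorted by name up front.
        fragments.sort_unstable_by(|left, right| left.name.cmp(&right.name));
        Self { fragments }
    }
}

fn main() {
    let embedder = Arc::new(RuntimeEmbedder::new(vec![
        RuntimeFragment { name: "text".into(), id: 1 },
        RuntimeFragment { name: "image".into(), id: 0 },
    ]));
    let embedders: HashMap<String, Arc<RuntimeEmbedder>> =
        [("default".to_string(), embedder)].into_iter().collect();

    let names: Vec<&str> =
        embedders["default"].fragments.iter().map(|f| f.name.as_str()).collect();
    let ids: Vec<u8> = embedders["default"].fragments.iter().map(|f| f.id).collect();
    assert_eq!(names, ["image", "text"]); // sorted by name, not insertion order
    assert_eq!(ids, [0, 1]);
}
```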