From 915cf4bae5ffe1bde0ae61b0e9475554fb3199aa Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 11:28:53 +0200 Subject: [PATCH 1/7] Add field.is_searchable property to fields --- milli/src/prompt/context.rs | 4 ++-- milli/src/prompt/fields.rs | 24 ++++++++++++++++------ milli/src/prompt/mod.rs | 40 +++++++++++++++++++++++++++++++++++-- 3 files changed, 58 insertions(+), 10 deletions(-) diff --git a/milli/src/prompt/context.rs b/milli/src/prompt/context.rs index a28a87caa..7ab08301a 100644 --- a/milli/src/prompt/context.rs +++ b/milli/src/prompt/context.rs @@ -5,7 +5,7 @@ use liquid::{ObjectView, ValueView}; use super::document::Document; use super::fields::Fields; -use crate::FieldsIdsMap; +use super::FieldsIdsMapWithMetadata; #[derive(Debug, Clone)] pub struct Context<'a> { @@ -14,7 +14,7 @@ pub struct Context<'a> { } impl<'a> Context<'a> { - pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self { + pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { Self { document, fields: Fields::new(document, field_id_map) } } } diff --git a/milli/src/prompt/fields.rs b/milli/src/prompt/fields.rs index 3187485f1..81ea88ca6 100644 --- a/milli/src/prompt/fields.rs +++ b/milli/src/prompt/fields.rs @@ -4,16 +4,20 @@ use liquid::model::{ use liquid::{ObjectView, ValueView}; use super::document::Document; -use crate::FieldsIdsMap; +use super::{FieldMetadata, FieldsIdsMapWithMetadata}; #[derive(Debug, Clone)] pub struct Fields<'a>(Vec>); impl<'a> Fields<'a> { - pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMap) -> Self { + pub fn new(document: &'a Document<'a>, field_id_map: &'a FieldsIdsMapWithMetadata<'a>) -> Self { Self( std::iter::repeat(document) .zip(field_id_map.iter()) - .map(|(document, (_fid, name))| FieldValue { document, name }) + .map(|(document, (fid, name))| FieldValue { + document, + name, + metadata: field_id_map.metadata(fid).unwrap_or_default(), + }) .collect(), ) } @@ -23,6 +27,7 @@ impl<'a> Fields<'a> { pub struct FieldValue<'a> { name: &'a str, document: &'a Document<'a>, + metadata: FieldMetadata, } impl<'a> ValueView for FieldValue<'a> { @@ -74,6 +79,10 @@ impl<'a> FieldValue<'a> { self.document.get(self.name).unwrap_or(&LiquidValue::Nil) } + pub fn is_searchable(&self) -> &bool { + &self.metadata.searchable + } + pub fn is_empty(&self) -> bool { self.size() == 0 } @@ -89,12 +98,14 @@ impl<'a> ObjectView for FieldValue<'a> { } fn keys<'k>(&'k self) -> Box> + 'k> { - Box::new(["name", "value"].iter().map(|&x| KStringCow::from_static(x))) + Box::new(["name", "value", "is_searchable"].iter().map(|&x| KStringCow::from_static(x))) } fn values<'k>(&'k self) -> Box + 'k> { Box::new( - std::iter::once(self.name() as &dyn ValueView).chain(std::iter::once(self.value())), + std::iter::once(self.name() as &dyn ValueView) + .chain(std::iter::once(self.value())) + .chain(std::iter::once(self.is_searchable() as &dyn ValueView)), ) } @@ -103,13 +114,14 @@ impl<'a> ObjectView for FieldValue<'a> { } fn contains_key(&self, index: &str) -> bool { - index == "name" || index == "value" + index == "name" || index == "value" || index == "is_searchable" } fn get<'s>(&'s self, index: &str) -> Option<&'s dyn ValueView> { match index { "name" => Some(self.name()), "value" => Some(self.value()), + "is_searchable" => Some(self.is_searchable()), _ => None, } } diff --git a/milli/src/prompt/mod.rs b/milli/src/prompt/mod.rs index 97ccbfb61..b7ea24f97 100644 --- a/milli/src/prompt/mod.rs +++ b/milli/src/prompt/mod.rs @@ -4,14 +4,16 @@ pub(crate) mod error; mod fields; mod template_checker; +use std::collections::BTreeMap; use std::convert::TryFrom; +use std::ops::Deref; use error::{NewPromptError, RenderPromptError}; use self::context::Context; use self::document::Document; use crate::update::del_add::DelAdd; -use crate::FieldsIdsMap; +use crate::{FieldId, FieldsIdsMap}; pub struct Prompt { template: liquid::Template, @@ -93,7 +95,7 @@ impl Prompt { &self, document: obkv::KvReaderU16<'_>, side: DelAdd, - field_id_map: &FieldsIdsMap, + field_id_map: &FieldsIdsMapWithMetadata, ) -> Result { let document = Document::new(document, side, field_id_map); let context = Context::new(&document, field_id_map); @@ -102,6 +104,40 @@ impl Prompt { } } +pub struct FieldsIdsMapWithMetadata<'a> { + fields_ids_map: &'a FieldsIdsMap, + metadata: BTreeMap, +} + +impl<'a> FieldsIdsMapWithMetadata<'a> { + pub fn new(fields_ids_map: &'a FieldsIdsMap, searchable_fields_ids: &'_ [FieldId]) -> Self { + let mut metadata: BTreeMap = + fields_ids_map.ids().map(|id| (id, Default::default())).collect(); + for searchable_field_id in searchable_fields_ids { + let Some(metadata) = metadata.get_mut(searchable_field_id) else { continue }; + metadata.searchable = true; + } + Self { fields_ids_map, metadata } + } + + pub fn metadata(&self, field_id: FieldId) -> Option { + self.metadata.get(&field_id).copied() + } +} + +impl<'a> Deref for FieldsIdsMapWithMetadata<'a> { + type Target = FieldsIdsMap; + + fn deref(&self) -> &Self::Target { + self.fields_ids_map + } +} + +#[derive(Debug, Default, Clone, Copy)] +pub struct FieldMetadata { + pub searchable: bool, +} + #[cfg(test)] mod test { use super::Prompt; From 580ea2f45078c42c981e37e3f71b9972fcbed0a8 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 11:30:10 +0200 Subject: [PATCH 2/7] Pass the fields <-> ids map with metadata to render --- .../extract/extract_vector_points.rs | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index f66c3fd46..e9b83b92c 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -15,14 +15,14 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::FaultSource; use crate::index::IndexEmbeddingConfig; -use crate::prompt::Prompt; +use crate::prompt::{FieldsIdsMapWithMetadata, Prompt}; use crate::update::del_add::{DelAdd, KvReaderDelAdd, KvWriterDelAdd}; use crate::update::settings::InnerIndexSettingsDiff; use crate::vector::error::{EmbedErrorKind, PossibleEmbeddingMistakes, UnusedVectorsDistribution}; use crate::vector::parsed_vectors::{ParsedVectorsDiff, VectorState, RESERVED_VECTORS_FIELD_NAME}; use crate::vector::settings::{EmbedderAction, ReindexAction}; use crate::vector::{Embedder, Embeddings}; -use crate::{try_split_array_at, DocumentId, FieldId, FieldsIdsMap, Result, ThreadPoolNoAbort}; +use crate::{try_split_array_at, DocumentId, FieldId, Result, ThreadPoolNoAbort}; /// The length of the elements that are always in the buffer when inserting new values. const TRUNCATE_SIZE: usize = size_of::(); @@ -189,7 +189,13 @@ pub fn extract_vector_points( let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; + let old_fields_ids_map = + FieldsIdsMapWithMetadata::new(old_fields_ids_map, &settings_diff.old.searchable_fields_ids); + let new_fields_ids_map = &settings_diff.new.fields_ids_map; + let new_fields_ids_map = + FieldsIdsMapWithMetadata::new(new_fields_ids_map, &settings_diff.new.searchable_fields_ids); + // the vector field id may have changed let old_vectors_fid = old_fields_ids_map.id(RESERVED_VECTORS_FIELD_NAME); @@ -376,7 +382,7 @@ pub fn extract_vector_points( ); continue; } - regenerate_prompt(obkv, prompt, new_fields_ids_map)? + regenerate_prompt(obkv, prompt, &new_fields_ids_map)? } }, // prompt regeneration is only triggered for existing embedders @@ -393,7 +399,7 @@ pub fn extract_vector_points( regenerate_if_prompt_changed( obkv, (old_prompt, prompt), - (old_fields_ids_map, new_fields_ids_map), + (&old_fields_ids_map, &new_fields_ids_map), )? } else { // we can simply ignore user provided vectors as they are not regenerated and are @@ -409,7 +415,7 @@ pub fn extract_vector_points( prompt, (add_to_user_provided, remove_from_user_provided), (old, new), - (old_fields_ids_map, new_fields_ids_map), + (&old_fields_ids_map, &new_fields_ids_map), document_id, embedder_name, embedder_is_manual, @@ -479,7 +485,10 @@ fn extract_vector_document_diff( prompt: &Prompt, (add_to_user_provided, remove_from_user_provided): (&mut RoaringBitmap, &mut RoaringBitmap), (old, new): (VectorState, VectorState), - (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + (old_fields_ids_map, new_fields_ids_map): ( + &FieldsIdsMapWithMetadata, + &FieldsIdsMapWithMetadata, + ), document_id: impl Fn() -> Value, embedder_name: &str, embedder_is_manual: bool, @@ -599,7 +608,10 @@ fn extract_vector_document_diff( fn regenerate_if_prompt_changed( obkv: obkv::KvReader<'_, FieldId>, (old_prompt, new_prompt): (&Prompt, &Prompt), - (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), + (old_fields_ids_map, new_fields_ids_map): ( + &FieldsIdsMapWithMetadata, + &FieldsIdsMapWithMetadata, + ), ) -> Result { let old_prompt = old_prompt.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or(Default::default()); @@ -614,7 +626,7 @@ fn regenerate_if_prompt_changed( fn regenerate_prompt( obkv: obkv::KvReader<'_, FieldId>, prompt: &Prompt, - new_fields_ids_map: &FieldsIdsMap, + new_fields_ids_map: &FieldsIdsMapWithMetadata, ) -> Result { let prompt = prompt.render(obkv, DelAdd::Addition, new_fields_ids_map)?; From 4464d319af5a6db8f0eab981362436006336cf4c Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 11:30:59 +0200 Subject: [PATCH 3/7] Change default template to use the new facility --- milli/src/prompt/mod.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/milli/src/prompt/mod.rs b/milli/src/prompt/mod.rs index b7ea24f97..47f949ea5 100644 --- a/milli/src/prompt/mod.rs +++ b/milli/src/prompt/mod.rs @@ -55,8 +55,10 @@ fn default_template() -> liquid::Template { } fn default_template_text() -> &'static str { - "{% for field in fields %} \ + "{% for field in fields %}\ + {% if field.is_searchable and field.value != nil %}\ {{ field.name }}: {{ field.value }}\n\ + {% endif %}\ {% endfor %}" } From 30a143f1493958f26415ed17f48d37c63c5a18bf Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 11:31:23 +0200 Subject: [PATCH 4/7] Test new facilities --- meilisearch/tests/vector/rest.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/meilisearch/tests/vector/rest.rs b/meilisearch/tests/vector/rest.rs index 1a64eeb78..d026b9dbb 100644 --- a/meilisearch/tests/vector/rest.rs +++ b/meilisearch/tests/vector/rest.rs @@ -1100,6 +1100,7 @@ async fn server_returns_bad_request() { let (response, code) = index .update_settings(json!({ + "searchableAttributes": ["name", "missing_field"], "embedders": { "rest": json!({ "source": "rest", "url": mock.uri(), "request": "{{text}}", "response": "{{embedding}}", "dimensions": 3 }), }, @@ -1115,6 +1116,10 @@ async fn server_returns_bad_request() { "type": "settingsUpdate", "canceledBy": null, "details": { + "searchableAttributes": [ + "name", + "missing_field" + ], "embedders": { "rest": { "source": "rest", @@ -1148,7 +1153,7 @@ async fn server_returns_bad_request() { "indexedDocuments": 0 }, "error": { - "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\" id: 1\\\\n name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 24\"}`", + "message": "While embedding documents for embedder `rest`: user error: sent a bad request to embedding server\n - Hint: check that the `request` in the embedder configuration matches the remote server's API\n - server replied with `{\"error\":\"Invalid request: invalid type: string \\\"name: kefir\\\\n\\\", expected struct MultipleRequest at line 1 column 15\"}`", "code": "vector_embedding_error", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#vector_embedding_error" From 03fda78901f555be6ec500e421516185f711b880 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 11:31:31 +0200 Subject: [PATCH 5/7] update other tests --- index-scheduler/src/lib.rs | 4 ++-- meilisearch/tests/settings/get_settings.rs | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/index-scheduler/src/lib.rs b/index-scheduler/src/lib.rs index 21e503567..705c7e9e3 100644 --- a/index-scheduler/src/lib.rs +++ b/index-scheduler/src/lib.rs @@ -5617,7 +5617,7 @@ mod tests { }, ), prompt: PromptData { - template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", }, }, user_provided: RoaringBitmap<[0]>, @@ -5657,7 +5657,7 @@ mod tests { }, ), prompt: PromptData { - template: "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + template: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", }, }, user_provided: RoaringBitmap<[]>, diff --git a/meilisearch/tests/settings/get_settings.rs b/meilisearch/tests/settings/get_settings.rs index 1571b8ca6..58bf958d7 100644 --- a/meilisearch/tests/settings/get_settings.rs +++ b/meilisearch/tests/settings/get_settings.rs @@ -190,7 +190,7 @@ async fn secrets_are_hidden_in_settings() { "source": "rest", "apiKey": "My suXXXXXX...", "dimensions": 4, - "documentTemplate": "{% for field in fields %} {{ field.name }}: {{ field.value }}\n{% endfor %}", + "documentTemplate": "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}", "url": "https://localhost:7777", "request": "{{text}}", "response": "{{embedding}}", From 21296190a3643b5b07f36a66b6d372cc71be9355 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 12:58:09 +0200 Subject: [PATCH 6/7] Reindex embedders --- milli/src/update/settings.rs | 28 +++++++++++++++++++++++++++- milli/src/vector/mod.rs | 10 ++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 9799fc6ec..29470521e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1238,7 +1238,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option, - embedding_config_updates: BTreeMap, + mut embedding_config_updates: BTreeMap, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1273,6 +1273,32 @@ impl InnerIndexSettingsDiff { let cache_user_defined_searchables = old_settings.user_defined_searchable_fields != new_settings.user_defined_searchable_fields; + // if the user-defined searchables changed, then we need to reindex prompts. + if cache_user_defined_searchables { + for (embedder_name, (config, _)) in new_settings.embedding_configs.inner_as_ref() { + // skip embedders that don't use document templates + if !config.uses_document_template() { + continue; + } + + // note: this could currently be entry.or_insert(..), but we're future-proofing with an explicit match + // this always makes the code clearer by explicitly handling the cases + match embedding_config_updates.entry(embedder_name.clone()) { + std::collections::btree_map::Entry::Vacant(entry) => { + entry.insert(EmbedderAction::Reindex(ReindexAction::RegeneratePrompts)); + } + std::collections::btree_map::Entry::Occupied(entry) => match entry.get() { + EmbedderAction::WriteBackToDocuments(_) => { /* we are deleting this embedder, so no point in regeneration */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { /* we are already fully reindexing */ + } + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { /* we are already regenerating prompts */ + } + }, + }; + } + } + InnerIndexSettingsDiff { old: old_settings, new: new_settings, diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index caccb404b..04e646819 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -305,6 +305,16 @@ impl Embedder { Embedder::Rest(embedder) => embedder.distribution(), } } + + pub fn uses_document_template(&self) -> bool { + match self { + Embedder::HuggingFace(_) + | Embedder::OpenAi(_) + | Embedder::Ollama(_) + | Embedder::Rest(_) => true, + Embedder::UserProvided(_) => false, + } + } } /// Describes the mean and sigma of distribution of embedding similarity in the embedding space. From 24ace5c38138bf355da42771d42d217fbd67efc0 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 13:37:01 +0200 Subject: [PATCH 7/7] Add reindexing test --- meilisearch/tests/vector/rest.rs | 147 +++++++++++++++++++++++++++++++ 1 file changed, 147 insertions(+) diff --git a/meilisearch/tests/vector/rest.rs b/meilisearch/tests/vector/rest.rs index d026b9dbb..2748d0846 100644 --- a/meilisearch/tests/vector/rest.rs +++ b/meilisearch/tests/vector/rest.rs @@ -1,3 +1,4 @@ +use std::collections::BTreeMap; use std::sync::atomic::{AtomicUsize, Ordering}; use meili_snap::{json_string, snapshot}; @@ -37,6 +38,46 @@ async fn create_mock() -> (MockServer, Value) { (mock_server, embedder_settings) } +async fn create_mock_map() -> (MockServer, Value) { + let mock_server = MockServer::start().await; + + let text_to_embedding: BTreeMap<_, _> = vec![ + // text -> embedding + ("name: kefir\n", [0.0, 0.1, 0.2]), + ] + // turn into btree + .into_iter() + .collect(); + + Mock::given(method("POST")) + .and(path("/")) + .respond_with(move |req: &Request| { + let text: String = req.body_json().unwrap(); + match text_to_embedding.get(text.as_str()) { + Some(embedding) => { + ResponseTemplate::new(200).set_body_json(json!({ "data": embedding })) + } + None => ResponseTemplate::new(404) + .set_body_json(json!({"error": "text not found", "text": text})), + } + }) + .mount(&mock_server) + .await; + let url = mock_server.uri(); + + let embedder_settings = json!({ + "source": "rest", + "url": url, + "dimensions": 3, + "request": "{{text}}", + "response": { + "data": "{{embedding}}" + } + }); + + (mock_server, embedder_settings) +} + #[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] struct MultipleRequest { input: Vec, @@ -1896,3 +1937,109 @@ async fn server_custom_header() { } "###); } + +#[actix_rt::test] +async fn searchable_reindex() { + let (_mock, setting) = create_mock_map().await; + let server = get_server_vector().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "searchableAttributes": ["name", "missing_field"], + "embedders": { + "rest": setting, + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchableAttributes": [ + "name", + "missing_field" + ], + "embedders": { + "rest": { + "source": "rest", + "dimensions": 3, + "url": "[url]", + "request": "{{text}}", + "response": { + "data": "{{embedding}}" + } + } + } + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + let (response, code) = + index.add_documents(json!( { "id": 1, "name": "kefir", "breed": "patou" }), None).await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "succeeded", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 1 + }, + "error": null, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); + + // triggers reindexing with the new searchable attribute. + // as the mock intentionally doesn't know of this text, the task will fail, outputting the putative rendered text. + let (response, code) = index + .update_settings(json!({ + "searchableAttributes": ["breed"], + })) + .await; + snapshot!(code, @"202 Accepted"); + let task = server.wait_task(response.uid()).await; + snapshot!(task, @r###" + { + "uid": "[uid]", + "indexUid": "doggo", + "status": "failed", + "type": "settingsUpdate", + "canceledBy": null, + "details": { + "searchableAttributes": [ + "breed" + ] + }, + "error": { + "message": "While embedding documents for embedder `rest`: error: received unexpected HTTP 404 from embedding server\n - server replied with `{\"error\":\"text not found\",\"text\":\"breed: patou\\n\"}`", + "code": "vector_embedding_error", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#vector_embedding_error" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "###); +}