From 21296190a3643b5b07f36a66b6d372cc71be9355 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 2 Sep 2024 12:58:09 +0200 Subject: [PATCH] Reindex embedders --- milli/src/update/settings.rs | 28 +++++++++++++++++++++++++++- milli/src/vector/mod.rs | 10 ++++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index 9799fc6ec..29470521e 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -1238,7 +1238,7 @@ impl InnerIndexSettingsDiff { old_settings: InnerIndexSettings, new_settings: InnerIndexSettings, primary_key_id: Option<FieldId>, - embedding_config_updates: BTreeMap<String, EmbedderAction>, + mut embedding_config_updates: BTreeMap<String, EmbedderAction>, settings_update_only: bool, ) -> Self { let only_additional_fields = match ( @@ -1273,6 +1273,32 @@ impl InnerIndexSettingsDiff { let cache_user_defined_searchables = old_settings.user_defined_searchable_fields != new_settings.user_defined_searchable_fields; + // if the user-defined searchables changed, then we need to reindex prompts. 
+ if cache_user_defined_searchables { + for (embedder_name, (config, _)) in new_settings.embedding_configs.inner_as_ref() { + // skip embedders that don't use document templates + if !config.uses_document_template() { + continue; + } + + // note: this could currently be entry.or_insert(..), but we're future-proofing with an explicit match + // this always makes the code clearer by explicitly handling the cases + match embedding_config_updates.entry(embedder_name.clone()) { + std::collections::btree_map::Entry::Vacant(entry) => { + entry.insert(EmbedderAction::Reindex(ReindexAction::RegeneratePrompts)); + } + std::collections::btree_map::Entry::Occupied(entry) => match entry.get() { + EmbedderAction::WriteBackToDocuments(_) => { /* we are deleting this embedder, so no point in regeneration */ + } + EmbedderAction::Reindex(ReindexAction::FullReindex) => { /* we are already fully reindexing */ + } + EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => { /* we are already regenerating prompts */ + } + }, + }; + } + } + InnerIndexSettingsDiff { old: old_settings, new: new_settings, diff --git a/milli/src/vector/mod.rs b/milli/src/vector/mod.rs index caccb404b..04e646819 100644 --- a/milli/src/vector/mod.rs +++ b/milli/src/vector/mod.rs @@ -305,6 +305,16 @@ impl Embedder { Embedder::Rest(embedder) => embedder.distribution(), } } + + pub fn uses_document_template(&self) -> bool { + match self { + Embedder::HuggingFace(_) + | Embedder::OpenAi(_) + | Embedder::Ollama(_) + | Embedder::Rest(_) => true, + Embedder::UserProvided(_) => false, + } + } } /// Describes the mean and sigma of distribution of embedding similarity in the embedding space.