Update embedder configs with a finer granularity

- no longer clear vector DB between any two embedder changes
This commit is contained in:
Louis Dureuil 2024-06-12 14:04:54 +02:00
parent d0b05ae691
commit d18c1f77d7
No known key found for this signature in database

View File

@ -23,7 +23,10 @@ use crate::proximity::ProximityPrecision;
use crate::update::index_documents::IndexDocumentsMethod; use crate::update::index_documents::IndexDocumentsMethod;
use crate::update::{IndexDocuments, UpdateIndexingStep}; use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME; use crate::vector::parsed_vectors::RESERVED_VECTORS_FIELD_NAME;
use crate::vector::settings::{check_set, check_unset, EmbedderSource, EmbeddingSettings}; use crate::vector::settings::{
check_set, check_unset, EmbedderAction, EmbedderSource, EmbeddingSettings, ReindexAction,
WriteBackToDocuments,
};
use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs}; use crate::vector::{Embedder, EmbeddingConfig, EmbeddingConfigs};
use crate::{FieldId, FieldsIdsMap, Index, Result}; use crate::{FieldId, FieldsIdsMap, Index, Result};
@ -924,111 +927,177 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
Ok(changed) Ok(changed)
} }
fn update_embedding_configs(&mut self) -> Result<bool> { fn update_embedding_configs(&mut self) -> Result<BTreeMap<String, EmbedderAction>> {
let update = match std::mem::take(&mut self.embedder_settings) { match std::mem::take(&mut self.embedder_settings) {
Setting::Set(configs) => { Setting::Set(configs) => self.update_embedding_configs_set(configs),
let mut changed = false; Setting::Reset => {
// all vectors should be written back to documents
let old_configs = self.index.embedding_configs(self.wtxn)?; let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, (Setting<EmbeddingSettings>, RoaringBitmap)> = let remove_all: Result<BTreeMap<String, EmbedderAction>> = old_configs
old_configs
.into_iter()
.map(
|IndexEmbeddingConfig { name, config, user_provided: user_defined }| {
(name, (Setting::Set(config.into()), user_defined))
},
)
.collect();
let mut new_configs = BTreeMap::new();
for joined in old_configs
.into_iter() .into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right)) .map(|IndexEmbeddingConfig { name, config: _, user_provided }| -> Result<_> {
{ let embedder_id =
match joined { self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
// updated config crate::InternalError::DatabaseMissingEntry {
EitherOrBoth::Both((name, (mut old, user_provided)), (_, new)) => { db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
changed |= EmbeddingSettings::apply_and_need_reindex(&mut old, new); key: None,
if changed { },
tracing::debug!( )?;
embedder = name, Ok((
user_provided = user_provided.len(),
"need reindex"
);
} else {
tracing::debug!(embedder = name, "skip reindex");
}
let new = validate_embedding_settings(old, &name)?;
new_configs.insert(name, (new, user_provided));
}
// unchanged config
EitherOrBoth::Left((name, setting)) => {
new_configs.insert(name, setting);
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(
&mut setting,
);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
changed = true;
new_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let new_configs: Vec<IndexEmbeddingConfig> = new_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => Some(IndexEmbeddingConfig {
name, name,
config: config.into(), EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
user_provided, embedder_id,
}), user_provided,
Setting::Reset => None, }),
Setting::NotSet => Some(IndexEmbeddingConfig { ))
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
}) })
.collect(); .collect();
let remove_all = remove_all?;
self.index.embedder_category_id.clear(self.wtxn)?; self.index.embedder_category_id.clear(self.wtxn)?;
for (index, index_embedding_config) in new_configs.iter().enumerate() {
self.index.embedder_category_id.put_with_flags(
self.wtxn,
heed::PutFlags::APPEND,
&index_embedding_config.name,
&index
.try_into()
.map_err(|_| UserError::TooManyEmbedders(new_configs.len()))?,
)?;
}
if new_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, new_configs)?;
}
changed
}
Setting::Reset => {
self.index.delete_embedding_configs(self.wtxn)?; self.index.delete_embedding_configs(self.wtxn)?;
true Ok(remove_all)
} }
Setting::NotSet => false, Setting::NotSet => Ok(Default::default()),
};
// if any changes force a reindexing
// clear the vector database.
if update {
self.index.vector_arroy.clear(self.wtxn)?;
} }
}
Ok(update) fn update_embedding_configs_set(
&mut self,
configs: BTreeMap<String, Setting<EmbeddingSettings>>,
) -> Result<BTreeMap<String, EmbedderAction>> {
use crate::vector::settings::SettingsDiff;
let old_configs = self.index.embedding_configs(self.wtxn)?;
let old_configs: BTreeMap<String, (EmbeddingSettings, RoaringBitmap)> = old_configs
.into_iter()
.map(|IndexEmbeddingConfig { name, config, user_provided }| {
(name, (config.into(), user_provided))
})
.collect();
let mut updated_configs = BTreeMap::new();
let mut embedder_actions = BTreeMap::new();
for joined in old_configs
.into_iter()
.merge_join_by(configs.into_iter(), |(left, _), (right, _)| left.cmp(right))
{
match joined {
// updated config
EitherOrBoth::Both((name, (old, user_provided)), (_, new)) => {
let settings_diff = SettingsDiff::from_settings(old, new);
match settings_diff {
SettingsDiff::Remove => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"removing embedder"
);
let embedder_id =
self.index.embedder_category_id.get(self.wtxn, &name)?.ok_or(
crate::InternalError::DatabaseMissingEntry {
db_name: crate::index::db_name::VECTOR_EMBEDDER_CATEGORY_ID,
key: None,
},
)?;
// free id immediately
self.index.embedder_category_id.delete(self.wtxn, &name)?;
embedder_actions.insert(
name,
EmbedderAction::WriteBackToDocuments(WriteBackToDocuments {
embedder_id,
user_provided,
}),
);
}
SettingsDiff::Reindex { action, updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
?action,
"reindex embedder"
);
embedder_actions.insert(name.clone(), EmbedderAction::Reindex(action));
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
SettingsDiff::UpdateWithoutReindex { updated_settings } => {
tracing::debug!(
embedder = name,
user_provided = user_provided.len(),
"update without reindex embedder"
);
let new =
validate_embedding_settings(Setting::Set(updated_settings), &name)?;
updated_configs.insert(name, (new, user_provided));
}
}
}
// unchanged config
EitherOrBoth::Left((name, (setting, user_provided))) => {
tracing::debug!(embedder = name, "unchanged embedder");
updated_configs.insert(name, (Setting::Set(setting), user_provided));
}
// new config
EitherOrBoth::Right((name, mut setting)) => {
tracing::debug!(embedder = name, "new embedder");
// apply the default source in case the source was not set so that it gets validated
crate::vector::settings::EmbeddingSettings::apply_default_source(&mut setting);
crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting,
);
let setting = validate_embedding_settings(setting, &name)?;
embedder_actions
.insert(name.clone(), EmbedderAction::Reindex(ReindexAction::FullReindex));
updated_configs.insert(name, (setting, RoaringBitmap::new()));
}
}
}
let mut free_indices: [bool; u8::MAX as usize] = [true; u8::MAX as usize];
for res in self.index.embedder_category_id.iter(self.wtxn)? {
let (_name, id) = res?;
free_indices[id as usize] = false;
}
let mut free_indices = free_indices.iter_mut().enumerate();
let mut find_free_index =
move || free_indices.find(|(_, free)| **free).map(|(index, _)| index as u8);
for (name, action) in embedder_actions.iter() {
match action {
EmbedderAction::Reindex(ReindexAction::RegeneratePrompts) => {
/* cannot be a new embedder, so has to have an id already */
}
EmbedderAction::Reindex(ReindexAction::FullReindex) => {
if self.index.embedder_category_id.get(self.wtxn, name)?.is_none() {
let id = find_free_index()
.ok_or(UserError::TooManyEmbedders(updated_configs.len()))?;
tracing::debug!(embedder = name, id, "assigning free id to new embedder");
self.index.embedder_category_id.put(self.wtxn, name, &id)?;
}
}
EmbedderAction::WriteBackToDocuments(_) => { /* already removed */ }
}
}
let updated_configs: Vec<IndexEmbeddingConfig> = updated_configs
.into_iter()
.filter_map(|(name, (config, user_provided))| match config {
Setting::Set(config) => {
Some(IndexEmbeddingConfig { name, config: config.into(), user_provided })
}
Setting::Reset => None,
Setting::NotSet => Some(IndexEmbeddingConfig {
name,
config: EmbeddingSettings::default().into(),
user_provided,
}),
})
.collect();
if updated_configs.is_empty() {
self.index.delete_embedding_configs(self.wtxn)?;
} else {
self.index.put_embedding_configs(self.wtxn, updated_configs)?;
}
Ok(embedder_actions)
} }
fn update_search_cutoff(&mut self) -> Result<bool> { fn update_search_cutoff(&mut self) -> Result<bool> {
@ -1082,13 +1151,8 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
self.update_searchable()?; self.update_searchable()?;
self.update_exact_attributes()?; self.update_exact_attributes()?;
self.update_proximity_precision()?; self.update_proximity_precision()?;
// TODO: very rough approximation of the needs for reindexing where any change will result in
// a full reindexing. let embedding_config_updates = self.update_embedding_configs()?;
// What can be done instead:
// 1. Only change the distance on a distance change
// 2. Only change the name -> embedder mapping on a name change
// 3. Keep the old vectors but reattempt indexing on a prompt change: only actually changed prompt will need embedding + storage
let embedding_configs_updated = self.update_embedding_configs()?;
let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?; let mut new_inner_settings = InnerIndexSettings::from_index(self.index, self.wtxn)?;
new_inner_settings.recompute_facets(self.wtxn, self.index)?; new_inner_settings.recompute_facets(self.wtxn, self.index)?;
@ -1102,7 +1166,7 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
old_inner_settings, old_inner_settings,
new_inner_settings, new_inner_settings,
primary_key_id, primary_key_id,
embedding_configs_updated, embedding_config_updates,
settings_update_only, settings_update_only,
); );
@ -1119,7 +1183,7 @@ pub struct InnerIndexSettingsDiff {
pub(crate) new: InnerIndexSettings, pub(crate) new: InnerIndexSettings,
pub(crate) primary_key_id: Option<FieldId>, pub(crate) primary_key_id: Option<FieldId>,
// TODO: compare directly the embedders. // TODO: compare directly the embedders.
pub(crate) embedding_configs_updated: bool, pub(crate) embedding_config_updates: BTreeMap<String, EmbedderAction>,
pub(crate) settings_update_only: bool, pub(crate) settings_update_only: bool,
/// The set of only the additional searchable fields. /// The set of only the additional searchable fields.
/// If any other searchable field has been modified, is set to None. /// If any other searchable field has been modified, is set to None.
@ -1140,7 +1204,7 @@ impl InnerIndexSettingsDiff {
old_settings: InnerIndexSettings, old_settings: InnerIndexSettings,
new_settings: InnerIndexSettings, new_settings: InnerIndexSettings,
primary_key_id: Option<FieldId>, primary_key_id: Option<FieldId>,
embedding_configs_updated: bool, embedding_config_updates: BTreeMap<String, EmbedderAction>,
settings_update_only: bool, settings_update_only: bool,
) -> Self { ) -> Self {
let only_additional_fields = match ( let only_additional_fields = match (
@ -1177,7 +1241,7 @@ impl InnerIndexSettingsDiff {
old: old_settings, old: old_settings,
new: new_settings, new: new_settings,
primary_key_id, primary_key_id,
embedding_configs_updated, embedding_config_updates,
settings_update_only, settings_update_only,
only_additional_fields, only_additional_fields,
cache_reindex_searchable_without_user_defined, cache_reindex_searchable_without_user_defined,
@ -1244,7 +1308,7 @@ impl InnerIndexSettingsDiff {
} }
pub fn reindex_vectors(&self) -> bool { pub fn reindex_vectors(&self) -> bool {
self.embedding_configs_updated !self.embedding_config_updates.is_empty()
} }
pub fn settings_update_only(&self) -> bool { pub fn settings_update_only(&self) -> bool {