Check consistency of fragments

This commit is contained in:
Louis Dureuil 2025-07-02 11:35:01 +02:00
parent d72e5f5f69
commit 3f5b5df139
No known key found for this signature in database
3 changed files with 58 additions and 20 deletions

View file

@ -501,8 +501,11 @@ impl Settings<Unchecked> {
let Setting::Set(mut configs) = self.embedders else { return Ok(self) }; let Setting::Set(mut configs) = self.embedders else { return Ok(self) };
for (name, config) in configs.iter_mut() { for (name, config) in configs.iter_mut() {
let config_to_check = std::mem::take(config); let config_to_check = std::mem::take(config);
let checked_config = let checked_config = milli::update::validate_embedding_settings(
milli::update::validate_embedding_settings(config_to_check.inner, name)?; config_to_check.inner,
name,
milli::vector::settings::EmbeddingValidationContext::SettingsPartialUpdate,
)?;
*config = SettingEmbeddingSettings { inner: checked_config }; *config = SettingEmbeddingSettings { inner: checked_config };
} }
self.embedders = Setting::Set(configs); self.embedders = Setting::Set(configs);

View file

@ -35,8 +35,8 @@ use crate::update::{IndexDocuments, UpdateIndexingStep};
use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig}; use crate::vector::db::{FragmentConfigs, IndexEmbeddingConfig};
use crate::vector::json_template::JsonTemplate; use crate::vector::json_template::JsonTemplate;
use crate::vector::settings::{ use crate::vector::settings::{
EmbedderAction, EmbedderSource, EmbeddingSettings, NestingContext, ReindexAction, EmbedderAction, EmbedderSource, EmbeddingSettings, EmbeddingValidationContext, NestingContext,
SubEmbeddingSettings, WriteBackToDocuments, ReindexAction, SubEmbeddingSettings, WriteBackToDocuments,
}; };
use crate::vector::{ use crate::vector::{
Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment, Embedder, EmbeddingConfig, RuntimeEmbedder, RuntimeEmbedders, RuntimeFragment,
@ -1181,13 +1181,20 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
}; };
embedder_actions.insert(name.clone(), embedder_action); embedder_actions.insert(name.clone(), embedder_action);
let new = validate_embedding_settings(updated_settings, &name)?; let new = validate_embedding_settings(
updated_settings,
&name,
EmbeddingValidationContext::FullSettings,
)?;
updated_configs.insert(name, (new, fragments)); updated_configs.insert(name, (new, fragments));
} }
SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => { SettingsDiff::UpdateWithoutReindex { updated_settings, quantize } => {
tracing::debug!(embedder = name, "update without reindex embedder"); tracing::debug!(embedder = name, "update without reindex embedder");
let new = let new = validate_embedding_settings(
validate_embedding_settings(Setting::Set(updated_settings), &name)?; Setting::Set(updated_settings),
&name,
EmbeddingValidationContext::FullSettings,
)?;
if quantize { if quantize {
embedder_actions.insert( embedder_actions.insert(
name.clone(), name.clone(),
@ -1211,7 +1218,11 @@ impl<'a, 't, 'i> Settings<'a, 't, 'i> {
crate::vector::settings::EmbeddingSettings::apply_default_openai_model( crate::vector::settings::EmbeddingSettings::apply_default_openai_model(
&mut setting, &mut setting,
); );
let setting = validate_embedding_settings(setting, &name)?; let setting = validate_embedding_settings(
setting,
&name,
EmbeddingValidationContext::FullSettings,
)?;
embedder_actions.insert( embedder_actions.insert(
name.clone(), name.clone(),
EmbedderAction::with_reindex(ReindexAction::FullReindex, false), EmbedderAction::with_reindex(ReindexAction::FullReindex, false),
@ -2079,6 +2090,7 @@ fn validate_prompt(
pub fn validate_embedding_settings( pub fn validate_embedding_settings(
settings: Setting<EmbeddingSettings>, settings: Setting<EmbeddingSettings>,
name: &str, name: &str,
context: EmbeddingValidationContext,
) -> Result<Setting<EmbeddingSettings>> { ) -> Result<Setting<EmbeddingSettings>> {
let Setting::Set(settings) = settings else { return Ok(settings) }; let Setting::Set(settings) = settings else { return Ok(settings) };
let EmbeddingSettings { let EmbeddingSettings {
@ -2119,10 +2131,10 @@ pub fn validate_embedding_settings(
})?; })?;
} }
if let Some(request) = request.as_ref().set() { // if we are working with partial settings, the user could have changed only the `request` and not given again the fragments
let request = crate::vector::rest::RequestData::new( if context == EmbeddingValidationContext::FullSettings {
request.to_owned(), if let Some(request) = request.as_ref().set() {
indexing_fragments let indexing_fragments: BTreeMap<_, _> = indexing_fragments
.as_ref() .as_ref()
.set() .set()
.iter() .iter()
@ -2130,8 +2142,8 @@ pub fn validate_embedding_settings(
.filter_map(|(name, fragment)| { .filter_map(|(name, fragment)| {
Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?))
}) })
.collect(), .collect();
search_fragments let search_fragments: BTreeMap<_, _> = search_fragments
.as_ref() .as_ref()
.set() .set()
.iter() .iter()
@ -2139,12 +2151,29 @@ pub fn validate_embedding_settings(
.filter_map(|(name, fragment)| { .filter_map(|(name, fragment)| {
Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?)) Some((name.clone(), fragment.as_ref().map(|fragment| fragment.value.clone())?))
}) })
.collect(), .collect();
)
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; let are_fragments_inconsistent =
if let Some(response) = response.as_ref().set() { indexing_fragments.is_empty() ^ search_fragments.is_empty();
crate::vector::rest::Response::new(response.to_owned(), &request) if are_fragments_inconsistent {
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?; return Err(crate::vector::error::NewEmbedderError::rest_inconsistent_fragments(
indexing_fragments.is_empty(),
indexing_fragments,
search_fragments,
))
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()).into());
}
let request = crate::vector::rest::RequestData::new(
request.to_owned(),
indexing_fragments,
search_fragments,
)
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;
if let Some(response) = response.as_ref().set() {
crate::vector::rest::Response::new(response.to_owned(), &request)
.map_err(|error| crate::UserError::VectorEmbeddingError(error.into()))?;
}
} }
} }

View file

@ -615,6 +615,12 @@ pub struct SubEmbeddingSettings {
pub indexing_embedder: Setting<serde_json::Value>, pub indexing_embedder: Setting<serde_json::Value>,
} }
/// Tells `validate_embedding_settings` whether the embedder settings it receives
/// are complete or only a partial update, so that checks needing the whole
/// configuration can be skipped for partial updates.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum EmbeddingValidationContext {
/// The settings are the full embedder configuration. In this context the
/// validation also checks that indexing and search fragments are either both
/// empty or both non-empty, and validates the REST `request`/`response` against
/// those fragments.
FullSettings,
/// The settings come from a partial settings update: the user may have changed
/// only the `request` without sending the fragments again, so the
/// fragment-dependent checks are not performed.
SettingsPartialUpdate,
}
/// Indicates what action should take place during a reindexing operation for an embedder /// Indicates what action should take place during a reindexing operation for an embedder
#[derive(Debug, Clone, PartialEq, Eq, Hash)] #[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum ReindexAction { pub enum ReindexAction {