From 2413592bbf9ea5c18161fc686d839a7b17446cac Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 22 Jul 2024 15:41:12 +0200 Subject: [PATCH] Display docid when there are documents without manual embeddings for a manual embedder --- .../extract/extract_vector_points.rs | 124 +++++++++++++++++- .../src/update/index_documents/extract/mod.rs | 1 + 2 files changed, 123 insertions(+), 2 deletions(-) diff --git a/milli/src/update/index_documents/extract/extract_vector_points.rs b/milli/src/update/index_documents/extract/extract_vector_points.rs index b984c3020..a7515282b 100644 --- a/milli/src/update/index_documents/extract/extract_vector_points.rs +++ b/milli/src/update/index_documents/extract/extract_vector_points.rs @@ -95,6 +95,84 @@ enum ExtractionAction { DocumentOperation(DocumentOperation), } +struct ManualEmbedderErrors { + embedder_name: String, + docid: String, + other_docids: usize, +} + +impl ManualEmbedderErrors { + pub fn push_error( + errors: &mut Option, + embedder_name: &str, + document_id: impl Fn() -> Value, + ) { + match errors { + Some(errors) => { + if errors.embedder_name == embedder_name { + errors.other_docids = errors.other_docids.saturating_add(1) + } + } + None => { + *errors = Some(Self { + embedder_name: embedder_name.to_owned(), + docid: document_id().to_string(), + other_docids: 0, + }); + } + } + } + + pub fn to_result( + errors: Option, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, + unused_vectors_distribution: &UnusedVectorsDistribution, + ) -> Result<()> { + match errors { + Some(errors) => { + let embedder_name = &errors.embedder_name; + let mut msg = format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document {}{}", + errors.docid, + if errors.other_docids != 0 { + format!(" and at least {} other document(s)", errors.other_docids) + } else { + "".to_string() + } + ); + + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + + let mut hint_count = 0; + + for (vector_misspelling, count) in + possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in possible_embedding_mistakes + .embedder_mistakes(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + + if hint_count == 0 { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + + Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))) + } + None => Ok(()), + } + } +} + /// Extracts the embedding vector contained in each document under the `_vectors` field. /// /// Returns the generated grenad reader containing the docid as key associated to the Vec @@ -104,8 +182,10 @@ pub fn extract_vector_points( indexer: GrenadParameters, embedders_configs: &[IndexEmbeddingConfig], settings_diff: &InnerIndexSettingsDiff, + possible_embedding_mistakes: &PossibleEmbeddingMistakes, ) -> Result<(Vec, UnusedVectorsDistribution)> { let mut unused_vectors_distribution = UnusedVectorsDistribution::new(); + let mut manual_errors = None; let reindex_vectors = settings_diff.reindex_vectors(); let old_fields_ids_map = &settings_diff.old.fields_ids_map; @@ -246,7 +326,7 @@ pub fn extract_vector_points( for EmbedderVectorExtractor { embedder_name, - embedder: _, + embedder, prompt, prompts_writer, remove_vectors_writer, @@ -255,6 +335,8 @@ pub fn extract_vector_points( action, } in extractors.iter_mut() { + let embedder_is_manual = matches!(**embedder, Embedder::UserProvided(_)); + let (old, new) = parsed_vectors.remove(embedder_name); let delta = match action { ExtractionAction::SettingsFullReindex => match old { @@ -285,11 +367,29 @@ pub fn extract_vector_points( // this happens only when an existing embedder changed. We cannot regenerate userProvided vectors VectorState::Manual => VectorStateDelta::NoChange, // generated vectors must be regenerated - VectorState::Generated => regenerate_prompt(obkv, prompt, new_fields_ids_map)?, + VectorState::Generated => { + if embedder_is_manual { + ManualEmbedderErrors::push_error( + &mut manual_errors, + embedder_name.as_str(), + document_id, + ); + continue; + } + regenerate_prompt(obkv, prompt, new_fields_ids_map)? + } }, // prompt regeneration is only triggered for existing embedders ExtractionAction::SettingsRegeneratePrompts { old_prompt } => { if old.must_regenerate() { + if embedder_is_manual { + ManualEmbedderErrors::push_error( + &mut manual_errors, + embedder_name.as_str(), + document_id, + ); + continue; + } regenerate_if_prompt_changed( obkv, (old_prompt, prompt), @@ -311,6 +411,9 @@ pub fn extract_vector_points( (old, new), (old_fields_ids_map, new_fields_ids_map), document_id, + embedder_name, + embedder_is_manual, + &mut manual_errors, )?, }; // and we finally push the unique vectors into the writer @@ -326,6 +429,12 @@ pub fn extract_vector_points( unused_vectors_distribution.append(parsed_vectors); } + ManualEmbedderErrors::to_result( + manual_errors, + possible_embedding_mistakes, + &unused_vectors_distribution, + )?; + let mut results = Vec::new(); for EmbedderVectorExtractor { @@ -371,6 +480,9 @@ fn extract_vector_document_diff( (old, new): (VectorState, VectorState), (old_fields_ids_map, new_fields_ids_map): (&FieldsIdsMap, &FieldsIdsMap), document_id: impl Fn() -> Value, + embedder_name: &str, + embedder_is_manual: bool, + manual_errors: &mut Option, ) -> Result { match (old.must_regenerate(), new.must_regenerate()) { (true, true) | (false, false) => {} @@ -408,6 +520,10 @@ fn extract_vector_document_diff( .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { + if embedder_is_manual { + ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); + return Ok(VectorStateDelta::NoChange); + } // Don't give up if the old prompt was failing let old_prompt = Some(&prompt).map(|p| { p.render(obkv, DelAdd::Deletion, old_fields_ids_map).unwrap_or_default() @@ -439,6 +555,10 @@ fn extract_vector_document_diff( .map(|(_, deladd)| KvReaderDelAdd::new(deladd)) .any(|deladd| deladd.get(DelAdd::Addition).is_some()); if document_is_kept { + if embedder_is_manual { + ManualEmbedderErrors::push_error(manual_errors, embedder_name, document_id); + return Ok(VectorStateDelta::NoChange); + } // becomes autogenerated VectorStateDelta::NowGenerated(prompt.render( obkv, diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 6c23a8da9..cab84400c 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -251,6 +251,7 @@ fn send_original_documents_data( indexer, &embedders_configs, &settings_diff, + &possible_embedding_mistakes, ) { Ok((extracted_vectors, unused_vectors_distribution)) => { for ExtractedVectorPoints {