From 3b0cb5b48738e24543f9c4810b5614597161dc00 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Tue, 12 Nov 2024 23:26:16 +0100 Subject: [PATCH] Fix vector error messages --- .../src/update/new/extract/vectors/mod.rs | 53 +++++++++++++++++++ crates/milli/src/vector/mod.rs | 2 +- 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index efb02b2ab..3a73ff82f 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -151,6 +151,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if new_rendered != old_rendered { chunks.set_autogenerated( update.docid(), + update.external_document_id(), new_rendered, &unused_vectors_distribution, )?; @@ -178,6 +179,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { if new_rendered != old_rendered { chunks.set_autogenerated( update.docid(), + update.external_document_id(), new_rendered, &unused_vectors_distribution, )?; @@ -221,6 +223,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; chunks.set_autogenerated( insertion.docid(), + insertion.external_document_id(), rendered, &unused_vectors_distribution, )?; @@ -233,6 +236,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> { )?; chunks.set_autogenerated( insertion.docid(), + insertion.external_document_id(), rendered, &unused_vectors_distribution, )?; @@ -268,6 +272,7 @@ struct Chunks<'a, 'extractor> { user_provided: &'a RefCell>, threads: &'a ThreadPoolNoAbort, sender: &'a EmbeddingSender<'a>, + has_manual_generation: Option<&'a str>, } impl<'a, 'extractor> Chunks<'a, 'extractor> { @@ -297,15 +302,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { embedder_id, embedder_name, user_provided, + has_manual_generation: None, } } pub fn set_autogenerated( &mut self, docid: DocumentId, + external_docid: &'a str, rendered: &'a str, unused_vectors_distribution: &UnusedVectorsDistributionBump, ) -> Result<()> { + let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_)); + if is_manual { + self.has_manual_generation.get_or_insert(external_docid); + } + if self.texts.len() < self.texts.capacity() { self.texts.push(rendered); self.ids.push(docid); @@ -322,6 +334,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution, self.threads, self.sender, + self.has_manual_generation.take(), ) } @@ -339,6 +352,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution, self.threads, self.sender, + self.has_manual_generation, ); // optimization: don't run bvec dtors as they only contain bumpalo allocated stuff std::mem::forget(self); @@ -356,7 +370,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> { unused_vectors_distribution: &UnusedVectorsDistributionBump, threads: &ThreadPoolNoAbort, sender: &EmbeddingSender<'a>, + has_manual_generation: Option<&'a str>, ) -> Result<()> { + if let Some(external_docid) = has_manual_generation { + let mut msg = format!( + r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}", + external_docid, + if ids.len() > 1 { + format!(" and at least {} other document(s)", ids.len() - 1) + } else { + "".to_string() + } + ); + + msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`."); + + let mut hint_count = 0; + + for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2) + { + msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s)."); + hint_count += 1; + } + + for (embedder_misspelling, count) in possible_embedding_mistakes + .embedder_mistakes_bump(embedder_name, unused_vectors_distribution) + .take(2) + { + msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s)."); + hint_count += 1; + } + + if hint_count == 0 { + msg += &format!( + "\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`" + ); + } + + return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg))); + } + let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) { Ok(embeddings) => { for (docid, embedding) in ids.into_iter().zip(embeddings) { diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 57da50580..24ea77541 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -648,7 +648,7 @@ impl Embedder { Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(), Embedder::OpenAi(embedder) => embedder.chunk_count_hint(), Embedder::Ollama(embedder) => embedder.chunk_count_hint(), - Embedder::UserProvided(_) => 1, + Embedder::UserProvided(_) => 100, Embedder::Rest(embedder) => embedder.chunk_count_hint(), } }