Fix vector error messages

This commit is contained in:
Louis Dureuil 2024-11-12 23:26:16 +01:00
parent bfdcd1cf33
commit 3b0cb5b487
No known key found for this signature in database
2 changed files with 54 additions and 1 deletions

View File

@ -151,6 +151,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
if new_rendered != old_rendered {
chunks.set_autogenerated(
update.docid(),
update.external_document_id(),
new_rendered,
&unused_vectors_distribution,
)?;
@ -178,6 +179,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
if new_rendered != old_rendered {
chunks.set_autogenerated(
update.docid(),
update.external_document_id(),
new_rendered,
&unused_vectors_distribution,
)?;
@ -221,6 +223,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
)?;
chunks.set_autogenerated(
insertion.docid(),
insertion.external_document_id(),
rendered,
&unused_vectors_distribution,
)?;
@ -233,6 +236,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
)?;
chunks.set_autogenerated(
insertion.docid(),
insertion.external_document_id(),
rendered,
&unused_vectors_distribution,
)?;
@ -268,6 +272,7 @@ struct Chunks<'a, 'extractor> {
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
threads: &'a ThreadPoolNoAbort,
sender: &'a EmbeddingSender<'a>,
has_manual_generation: Option<&'a str>,
}
impl<'a, 'extractor> Chunks<'a, 'extractor> {
@ -297,15 +302,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
embedder_id,
embedder_name,
user_provided,
has_manual_generation: None,
}
}
pub fn set_autogenerated(
&mut self,
docid: DocumentId,
external_docid: &'a str,
rendered: &'a str,
unused_vectors_distribution: &UnusedVectorsDistributionBump,
) -> Result<()> {
let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_));
if is_manual {
self.has_manual_generation.get_or_insert(external_docid);
}
if self.texts.len() < self.texts.capacity() {
self.texts.push(rendered);
self.ids.push(docid);
@ -322,6 +334,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
unused_vectors_distribution,
self.threads,
self.sender,
self.has_manual_generation.take(),
)
}
@ -339,6 +352,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
unused_vectors_distribution,
self.threads,
self.sender,
self.has_manual_generation,
);
// optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
std::mem::forget(self);
@ -356,7 +370,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
unused_vectors_distribution: &UnusedVectorsDistributionBump,
threads: &ThreadPoolNoAbort,
sender: &EmbeddingSender<'a>,
has_manual_generation: Option<&'a str>,
) -> Result<()> {
if let Some(external_docid) = has_manual_generation {
let mut msg = format!(
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}",
external_docid,
if ids.len() > 1 {
format!(" and at least {} other document(s)", ids.len() - 1)
} else {
"".to_string()
}
);
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
let mut hint_count = 0;
for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2)
{
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
hint_count += 1;
}
for (embedder_misspelling, count) in possible_embedding_mistakes
.embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
.take(2)
{
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
hint_count += 1;
}
if hint_count == 0 {
msg += &format!(
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
);
}
return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
}
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
Ok(embeddings) => {
for (docid, embedding) in ids.into_iter().zip(embeddings) {

View File

@ -648,7 +648,7 @@ impl Embedder {
Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
Embedder::UserProvided(_) => 1,
Embedder::UserProvided(_) => 100,
Embedder::Rest(embedder) => embedder.chunk_count_hint(),
}
}