mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-15 09:28:55 +01:00
Fix vector error messages
This commit is contained in:
parent
bfdcd1cf33
commit
3b0cb5b487
@ -151,6 +151,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
update.external_document_id(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
@ -178,6 +179,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
if new_rendered != old_rendered {
|
||||
chunks.set_autogenerated(
|
||||
update.docid(),
|
||||
update.external_document_id(),
|
||||
new_rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
@ -221,6 +223,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
insertion.external_document_id(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
@ -233,6 +236,7 @@ impl<'a, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a> {
|
||||
)?;
|
||||
chunks.set_autogenerated(
|
||||
insertion.docid(),
|
||||
insertion.external_document_id(),
|
||||
rendered,
|
||||
&unused_vectors_distribution,
|
||||
)?;
|
||||
@ -268,6 +272,7 @@ struct Chunks<'a, 'extractor> {
|
||||
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
||||
threads: &'a ThreadPoolNoAbort,
|
||||
sender: &'a EmbeddingSender<'a>,
|
||||
has_manual_generation: Option<&'a str>,
|
||||
}
|
||||
|
||||
impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||
@ -297,15 +302,22 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||
embedder_id,
|
||||
embedder_name,
|
||||
user_provided,
|
||||
has_manual_generation: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn set_autogenerated(
|
||||
&mut self,
|
||||
docid: DocumentId,
|
||||
external_docid: &'a str,
|
||||
rendered: &'a str,
|
||||
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||
) -> Result<()> {
|
||||
let is_manual = matches!(&self.embedder, &Embedder::UserProvided(_));
|
||||
if is_manual {
|
||||
self.has_manual_generation.get_or_insert(external_docid);
|
||||
}
|
||||
|
||||
if self.texts.len() < self.texts.capacity() {
|
||||
self.texts.push(rendered);
|
||||
self.ids.push(docid);
|
||||
@ -322,6 +334,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||
unused_vectors_distribution,
|
||||
self.threads,
|
||||
self.sender,
|
||||
self.has_manual_generation.take(),
|
||||
)
|
||||
}
|
||||
|
||||
@ -339,6 +352,7 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||
unused_vectors_distribution,
|
||||
self.threads,
|
||||
self.sender,
|
||||
self.has_manual_generation,
|
||||
);
|
||||
// optimization: don't run bvec dtors as they only contain bumpalo allocated stuff
|
||||
std::mem::forget(self);
|
||||
@ -356,7 +370,46 @@ impl<'a, 'extractor> Chunks<'a, 'extractor> {
|
||||
unused_vectors_distribution: &UnusedVectorsDistributionBump,
|
||||
threads: &ThreadPoolNoAbort,
|
||||
sender: &EmbeddingSender<'a>,
|
||||
has_manual_generation: Option<&'a str>,
|
||||
) -> Result<()> {
|
||||
if let Some(external_docid) = has_manual_generation {
|
||||
let mut msg = format!(
|
||||
r"While embedding documents for embedder `{embedder_name}`: no vectors provided for document `{}`{}",
|
||||
external_docid,
|
||||
if ids.len() > 1 {
|
||||
format!(" and at least {} other document(s)", ids.len() - 1)
|
||||
} else {
|
||||
"".to_string()
|
||||
}
|
||||
);
|
||||
|
||||
msg += &format!("\n- Note: `{embedder_name}` has `source: userProvided`, so documents must provide embeddings as an array in `_vectors.{embedder_name}`.");
|
||||
|
||||
let mut hint_count = 0;
|
||||
|
||||
for (vector_misspelling, count) in possible_embedding_mistakes.vector_mistakes().take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `{vector_misspelling}` by `_vectors` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
for (embedder_misspelling, count) in possible_embedding_mistakes
|
||||
.embedder_mistakes_bump(embedder_name, unused_vectors_distribution)
|
||||
.take(2)
|
||||
{
|
||||
msg += &format!("\n- Hint: try replacing `_vectors.{embedder_misspelling}` by `_vectors.{embedder_name}` in {count} document(s).");
|
||||
hint_count += 1;
|
||||
}
|
||||
|
||||
if hint_count == 0 {
|
||||
msg += &format!(
|
||||
"\n- Hint: opt-out for a document with `_vectors.{embedder_name}: null`"
|
||||
);
|
||||
}
|
||||
|
||||
return Err(crate::Error::UserError(crate::UserError::DocumentEmbeddingError(msg)));
|
||||
}
|
||||
|
||||
let res = match embedder.embed_chunks_ref(texts.as_slice(), threads) {
|
||||
Ok(embeddings) => {
|
||||
for (docid, embedding) in ids.into_iter().zip(embeddings) {
|
||||
|
@ -648,7 +648,7 @@ impl Embedder {
|
||||
Embedder::HuggingFace(embedder) => embedder.chunk_count_hint(),
|
||||
Embedder::OpenAi(embedder) => embedder.chunk_count_hint(),
|
||||
Embedder::Ollama(embedder) => embedder.chunk_count_hint(),
|
||||
Embedder::UserProvided(_) => 1,
|
||||
Embedder::UserProvided(_) => 100,
|
||||
Embedder::Rest(embedder) => embedder.chunk_count_hint(),
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user