mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-05-25 09:03:59 +02:00
Check dimension mismatch at insertion time
This commit is contained in:
parent
94ea263bef
commit
f729864466
@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
// do we have set embeddings?
|
// do we have set embeddings?
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
chunks.set_vectors(
|
chunks.set_vectors(
|
||||||
|
update.external_document_id(),
|
||||||
update.docid(),
|
update.docid(),
|
||||||
embeddings
|
embeddings
|
||||||
.into_vec(&context.doc_alloc, embedder_name)
|
.into_vec(&context.doc_alloc, embedder_name)
|
||||||
@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
document_id: update.external_document_id().to_string(),
|
document_id: update.external_document_id().to_string(),
|
||||||
error: error.to_string(),
|
error: error.to_string(),
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
} else if new_vectors.regenerate {
|
} else if new_vectors.regenerate {
|
||||||
let new_rendered = prompt.render_document(
|
let new_rendered = prompt.render_document(
|
||||||
update.external_document_id(),
|
update.external_document_id(),
|
||||||
@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
chunks.set_vectors(
|
chunks.set_vectors(
|
||||||
|
insertion.external_document_id(),
|
||||||
insertion.docid(),
|
insertion.docid(),
|
||||||
embeddings
|
embeddings
|
||||||
.into_vec(&context.doc_alloc, embedder_name)
|
.into_vec(&context.doc_alloc, embedder_name)
|
||||||
@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
.to_string(),
|
.to_string(),
|
||||||
error: error.to_string(),
|
error: error.to_string(),
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
} else if new_vectors.regenerate {
|
} else if new_vectors.regenerate {
|
||||||
let rendered = prompt.render_document(
|
let rendered = prompt.render_document(
|
||||||
insertion.external_document_id(),
|
insertion.external_document_id(),
|
||||||
@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
|
|||||||
embedder: &'a Embedder,
|
embedder: &'a Embedder,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
|
dimensions: usize,
|
||||||
prompt: &'a Prompt,
|
prompt: &'a Prompt,
|
||||||
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
||||||
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
||||||
@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
||||||
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
|
let dimensions = embedder.dimensions();
|
||||||
Self {
|
Self {
|
||||||
texts,
|
texts,
|
||||||
ids,
|
ids,
|
||||||
@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
embedder_name,
|
embedder_name,
|
||||||
user_provided,
|
user_provided,
|
||||||
has_manual_generation: None,
|
has_manual_generation: None,
|
||||||
|
dimensions,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
|
fn set_vectors(
|
||||||
|
&self,
|
||||||
|
external_docid: &'a str,
|
||||||
|
docid: DocumentId,
|
||||||
|
embeddings: Vec<Embedding>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for (embedding_index, embedding) in embeddings.iter().enumerate() {
|
||||||
|
if embedding.len() != self.dimensions {
|
||||||
|
return Err(UserError::InvalidIndexingVectorDimensions {
|
||||||
|
expected: self.dimensions,
|
||||||
|
found: embedding.len(),
|
||||||
|
embedder_name: self.embedder_name.to_string(),
|
||||||
|
document_id: external_docid.to_string(),
|
||||||
|
embedding_index,
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user