mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-05-14 08:14:05 +02:00
Merge pull request #5478 from meilisearch/enforce-embedding-dimensions
Enforce embedding dimensions
This commit is contained in:
commit
e36a8c50b9
@ -454,7 +454,10 @@ impl ErrorCode for milli::Error {
|
|||||||
}
|
}
|
||||||
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
|
UserError::CriterionError(_) => Code::InvalidSettingsRankingRules,
|
||||||
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
|
UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField,
|
||||||
UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions,
|
UserError::InvalidVectorDimensions { .. }
|
||||||
|
| UserError::InvalidIndexingVectorDimensions { .. } => {
|
||||||
|
Code::InvalidVectorDimensions
|
||||||
|
}
|
||||||
UserError::InvalidVectorsMapType { .. }
|
UserError::InvalidVectorsMapType { .. }
|
||||||
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
|
| UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType,
|
||||||
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
|
UserError::TooManyVectors(_, _) => Code::TooManyVectors,
|
||||||
|
@ -164,6 +164,87 @@ async fn add_remove_user_provided() {
|
|||||||
"###);
|
"###);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[actix_rt::test]
|
||||||
|
async fn user_provide_mismatched_embedding_dimension() {
|
||||||
|
let server = Server::new().await;
|
||||||
|
let index = server.index("doggo");
|
||||||
|
|
||||||
|
let (response, code) = index
|
||||||
|
.update_settings(json!({
|
||||||
|
"embedders": {
|
||||||
|
"manual": {
|
||||||
|
"source": "userProvided",
|
||||||
|
"dimensions": 3,
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
server.wait_task(response.uid()).await.succeeded();
|
||||||
|
|
||||||
|
let documents = json!([
|
||||||
|
{"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }},
|
||||||
|
]);
|
||||||
|
let (value, code) = index.add_documents(documents, None).await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
let task = index.wait_task(value.uid()).await;
|
||||||
|
snapshot!(task, @r###"
|
||||||
|
{
|
||||||
|
"uid": "[uid]",
|
||||||
|
"batchUid": "[batch_uid]",
|
||||||
|
"indexUid": "doggo",
|
||||||
|
"status": "failed",
|
||||||
|
"type": "documentAdditionOrUpdate",
|
||||||
|
"canceledBy": null,
|
||||||
|
"details": {
|
||||||
|
"receivedDocuments": 1,
|
||||||
|
"indexedDocuments": 0
|
||||||
|
},
|
||||||
|
"error": {
|
||||||
|
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
|
||||||
|
"code": "invalid_vector_dimensions",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
|
||||||
|
},
|
||||||
|
"duration": "[duration]",
|
||||||
|
"enqueuedAt": "[date]",
|
||||||
|
"startedAt": "[date]",
|
||||||
|
"finishedAt": "[date]"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
|
||||||
|
let new_document = json!([
|
||||||
|
{"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }},
|
||||||
|
]);
|
||||||
|
let (response, code) = index.add_documents(new_document, None).await;
|
||||||
|
snapshot!(code, @"202 Accepted");
|
||||||
|
let task = index.wait_task(response.uid()).await;
|
||||||
|
snapshot!(task, @r###"
|
||||||
|
{
|
||||||
|
"uid": "[uid]",
|
||||||
|
"batchUid": "[batch_uid]",
|
||||||
|
"indexUid": "doggo",
|
||||||
|
"status": "failed",
|
||||||
|
"type": "documentAdditionOrUpdate",
|
||||||
|
"canceledBy": null,
|
||||||
|
"details": {
|
||||||
|
"receivedDocuments": 1,
|
||||||
|
"indexedDocuments": 0
|
||||||
|
},
|
||||||
|
"error": {
|
||||||
|
"message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3",
|
||||||
|
"code": "invalid_vector_dimensions",
|
||||||
|
"type": "invalid_request",
|
||||||
|
"link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions"
|
||||||
|
},
|
||||||
|
"duration": "[duration]",
|
||||||
|
"enqueuedAt": "[date]",
|
||||||
|
"startedAt": "[date]",
|
||||||
|
"finishedAt": "[date]"
|
||||||
|
}
|
||||||
|
"###);
|
||||||
|
}
|
||||||
|
|
||||||
async fn generate_default_user_provided_documents(server: &Server) -> Index {
|
async fn generate_default_user_provided_documents(server: &Server) -> Index {
|
||||||
let index = server.index("doggo");
|
let index = server.index("doggo");
|
||||||
|
|
||||||
|
@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string()
|
|||||||
InvalidGeoField(#[from] GeoError),
|
InvalidGeoField(#[from] GeoError),
|
||||||
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
|
#[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)]
|
||||||
InvalidVectorDimensions { expected: usize, found: usize },
|
InvalidVectorDimensions { expected: usize, found: usize },
|
||||||
|
#[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")]
|
||||||
|
InvalidIndexingVectorDimensions {
|
||||||
|
embedder_name: String,
|
||||||
|
document_id: String,
|
||||||
|
embedding_index: usize,
|
||||||
|
expected: usize,
|
||||||
|
found: usize,
|
||||||
|
},
|
||||||
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
|
#[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")]
|
||||||
InvalidVectorsMapType { document_id: String, value: Value },
|
InvalidVectorsMapType { document_id: String, value: Value },
|
||||||
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
|
#[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")]
|
||||||
|
@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
// do we have set embeddings?
|
// do we have set embeddings?
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
chunks.set_vectors(
|
chunks.set_vectors(
|
||||||
|
update.external_document_id(),
|
||||||
update.docid(),
|
update.docid(),
|
||||||
embeddings
|
embeddings
|
||||||
.into_vec(&context.doc_alloc, embedder_name)
|
.into_vec(&context.doc_alloc, embedder_name)
|
||||||
@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
document_id: update.external_document_id().to_string(),
|
document_id: update.external_document_id().to_string(),
|
||||||
error: error.to_string(),
|
error: error.to_string(),
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
} else if new_vectors.regenerate {
|
} else if new_vectors.regenerate {
|
||||||
let new_rendered = prompt.render_document(
|
let new_rendered = prompt.render_document(
|
||||||
update.external_document_id(),
|
update.external_document_id(),
|
||||||
@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
chunks.set_regenerate(insertion.docid(), new_vectors.regenerate);
|
||||||
if let Some(embeddings) = new_vectors.embeddings {
|
if let Some(embeddings) = new_vectors.embeddings {
|
||||||
chunks.set_vectors(
|
chunks.set_vectors(
|
||||||
|
insertion.external_document_id(),
|
||||||
insertion.docid(),
|
insertion.docid(),
|
||||||
embeddings
|
embeddings
|
||||||
.into_vec(&context.doc_alloc, embedder_name)
|
.into_vec(&context.doc_alloc, embedder_name)
|
||||||
@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> {
|
|||||||
.to_string(),
|
.to_string(),
|
||||||
error: error.to_string(),
|
error: error.to_string(),
|
||||||
})?,
|
})?,
|
||||||
);
|
)?;
|
||||||
} else if new_vectors.regenerate {
|
} else if new_vectors.regenerate {
|
||||||
let rendered = prompt.render_document(
|
let rendered = prompt.render_document(
|
||||||
insertion.external_document_id(),
|
insertion.external_document_id(),
|
||||||
@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> {
|
|||||||
embedder: &'a Embedder,
|
embedder: &'a Embedder,
|
||||||
embedder_id: u8,
|
embedder_id: u8,
|
||||||
embedder_name: &'a str,
|
embedder_name: &'a str,
|
||||||
|
dimensions: usize,
|
||||||
prompt: &'a Prompt,
|
prompt: &'a Prompt,
|
||||||
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
possible_embedding_mistakes: &'a PossibleEmbeddingMistakes,
|
||||||
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
user_provided: &'a RefCell<EmbeddingExtractorData<'extractor>>,
|
||||||
@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint();
|
||||||
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
let texts = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
let ids = BVec::with_capacity_in(capacity, doc_alloc);
|
||||||
|
let dimensions = embedder.dimensions();
|
||||||
Self {
|
Self {
|
||||||
texts,
|
texts,
|
||||||
ids,
|
ids,
|
||||||
@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
embedder_name,
|
embedder_name,
|
||||||
user_provided,
|
user_provided,
|
||||||
has_manual_generation: None,
|
has_manual_generation: None,
|
||||||
|
dimensions,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn set_vectors(&self, docid: DocumentId, embeddings: Vec<Embedding>) {
|
fn set_vectors(
|
||||||
|
&self,
|
||||||
|
external_docid: &'a str,
|
||||||
|
docid: DocumentId,
|
||||||
|
embeddings: Vec<Embedding>,
|
||||||
|
) -> Result<()> {
|
||||||
|
for (embedding_index, embedding) in embeddings.iter().enumerate() {
|
||||||
|
if embedding.len() != self.dimensions {
|
||||||
|
return Err(UserError::InvalidIndexingVectorDimensions {
|
||||||
|
expected: self.dimensions,
|
||||||
|
found: embedding.len(),
|
||||||
|
embedder_name: self.embedder_name.to_string(),
|
||||||
|
document_id: external_docid.to_string(),
|
||||||
|
embedding_index,
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
}
|
||||||
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap();
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user