From 7707fb18dd0c9138721e7e4cfaeb96c363fe8e6c Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:51:36 +0100 Subject: [PATCH 1/6] add embedding with dimension mismatch test case --- crates/meilisearch/tests/vector/mod.rs | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 67da51702..c6f32ccc5 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -164,6 +164,56 @@ async fn add_remove_user_provided() { "###); } +#[actix_rt::test] +async fn user_provide_mismatched_embedding_dimension() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + async fn generate_default_user_provided_documents(server: &Server) -> Index { let index = server.index("doggo"); From 62de70b73c3f7ba7fdd62c102a8ac0edbd4de68b Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Wed, 26 Mar 2025 12:57:25 +0100 Subject: [PATCH 2/6] Document problematic case in test and acknowledge PR comment --- crates/meilisearch/tests/vector/mod.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index c6f32ccc5..fd9c314e2 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -212,6 +212,14 @@ async fn user_provide_mismatched_embedding_dimension() { "finishedAt": "[date]" } "#); + + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass + let new_document = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, + ]); + let (value, code) = index.add_documents(new_document, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 0e475cb5e649fb2b4b78a263f423b6d0ca74b31e Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Mar 2025 11:07:01 +0100 Subject: [PATCH 3/6] fix warn and show what meilisearch understood of the vectors in the cursed test --- crates/meilisearch/tests/vector/mod.rs | 35 +++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index fd9c314e2..14474c210 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -217,9 +217,42 @@ async fn user_provide_mismatched_embedding_dimension() { let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); - let (value, code) = index.add_documents(new_document, None).await; + let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); index.wait_task(response.uid()).await.succeeded(); + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ], + [ + 1.0, + 2.0, + 2.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); } async fn generate_default_user_provided_documents(server: &Server) -> Index { From 94ea263befc7f5e49ccbed6c27146dbb331dc95d Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:28 +0200 Subject: [PATCH 4/6] Add new error for dimensions mismatch during indexing --- crates/meilisearch-types/src/error.rs | 5 ++++- crates/milli/src/error.rs | 8 ++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch-types/src/error.rs b/crates/meilisearch-types/src/error.rs index 859563d8a..6c547d51e 100644 --- a/crates/meilisearch-types/src/error.rs +++ b/crates/meilisearch-types/src/error.rs @@ -454,7 +454,10 @@ impl ErrorCode for milli::Error { } UserError::CriterionError(_) => Code::InvalidSettingsRankingRules, UserError::InvalidGeoField { .. } => Code::InvalidDocumentGeoField, - UserError::InvalidVectorDimensions { .. } => Code::InvalidVectorDimensions, + UserError::InvalidVectorDimensions { .. } + | UserError::InvalidIndexingVectorDimensions { .. } => { + Code::InvalidVectorDimensions + } UserError::InvalidVectorsMapType { .. } | UserError::InvalidVectorsEmbedderConf { .. } => Code::InvalidVectorsType, UserError::TooManyVectors(_, _) => Code::TooManyVectors, diff --git a/crates/milli/src/error.rs b/crates/milli/src/error.rs index e1098cfa5..e61283e4c 100644 --- a/crates/milli/src/error.rs +++ b/crates/milli/src/error.rs @@ -129,6 +129,14 @@ and can not be more than 511 bytes.", .document_id.to_string() InvalidGeoField(#[from] GeoError), #[error("Invalid vector dimensions: expected: `{}`, found: `{}`.", .expected, .found)] InvalidVectorDimensions { expected: usize, found: usize }, + #[error("Invalid vector dimensions in document with id `{document_id}` in `._vectors.{embedder_name}`.\n - note: embedding #{embedding_index} has dimensions {found}\n - note: embedder `{embedder_name}` requires {expected}")] + InvalidIndexingVectorDimensions { + embedder_name: String, + document_id: String, + embedding_index: usize, + expected: usize, + found: usize, + }, #[error("The `_vectors` field in the document with id: `{document_id}` is not an object. Was expecting an object with a key for each embedder with manually provided vectors, but instead got `{value}`")] InvalidVectorsMapType { document_id: String, value: Value }, #[error("Bad embedder configuration in the document with id: `{document_id}`. {error}")] From f72986446668e9ea504b79d55e7e8505b00c0685 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 13:43:57 +0200 Subject: [PATCH 5/6] Check dimension mismatch at insertion time --- .../src/update/new/extract/vectors/mod.rs | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index 6820ee67b..696864e7f 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -121,6 +121,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { // do we have set embeddings? if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + update.external_document_id(), update.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -128,7 +129,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { document_id: update.external_document_id().to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let new_rendered = prompt.render_document( update.external_document_id(), @@ -209,6 +210,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { chunks.set_regenerate(insertion.docid(), new_vectors.regenerate); if let Some(embeddings) = new_vectors.embeddings { chunks.set_vectors( + insertion.external_document_id(), insertion.docid(), embeddings .into_vec(&context.doc_alloc, embedder_name) @@ -218,7 +220,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { .to_string(), error: error.to_string(), })?, - ); + )?; } else if new_vectors.regenerate { let rendered = prompt.render_document( insertion.external_document_id(), @@ -273,6 +275,7 @@ struct Chunks<'a, 'b, 'extractor> { embedder: &'a Embedder, embedder_id: u8, embedder_name: &'a str, + dimensions: usize, prompt: &'a Prompt, possible_embedding_mistakes: &'a PossibleEmbeddingMistakes, user_provided: &'a RefCell>, @@ -297,6 +300,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { let capacity = embedder.prompt_count_in_chunk_hint() * embedder.chunk_count_hint(); let texts = BVec::with_capacity_in(capacity, doc_alloc); let ids = BVec::with_capacity_in(capacity, doc_alloc); + let dimensions = embedder.dimensions(); Self { texts, ids, @@ -309,6 +313,7 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { embedder_name, user_provided, has_manual_generation: None, + dimensions, } } @@ -490,7 +495,25 @@ impl<'a, 'b, 'extractor> Chunks<'a, 'b, 'extractor> { } } - fn set_vectors(&self, docid: DocumentId, embeddings: Vec) { + fn set_vectors( + &self, + external_docid: &'a str, + docid: DocumentId, + embeddings: Vec, + ) -> Result<()> { + for (embedding_index, embedding) in embeddings.iter().enumerate() { + if embedding.len() != self.dimensions { + return Err(UserError::InvalidIndexingVectorDimensions { + expected: self.dimensions, + found: embedding.len(), + embedder_name: self.embedder_name.to_string(), + document_id: external_docid.to_string(), + embedding_index, + } + .into()); + } + } self.sender.set_vectors(docid, self.embedder_id, embeddings).unwrap(); + Ok(()) } } From 08ff135ad6c48d4936e4a45bc86219be208ff273 Mon Sep 17 00:00:00 2001 From: Louis Dureuil Date: Mon, 31 Mar 2025 15:26:31 +0200 Subject: [PATCH 6/6] Fix test --- crates/meilisearch/tests/vector/mod.rs | 60 +++++++++++--------------- 1 file changed, 25 insertions(+), 35 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 14474c210..5e34a4c23 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -188,7 +188,7 @@ async fn user_provide_mismatched_embedding_dimension() { let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); let task = index.wait_task(value.uid()).await; - snapshot!(task, @r#" + snapshot!(task, @r###" { "uid": "[uid]", "batchUid": "[batch_uid]", @@ -201,7 +201,7 @@ async fn user_provide_mismatched_embedding_dimension() { "indexedDocuments": 0 }, "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", "code": "invalid_vector_dimensions", "type": "invalid_request", "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" @@ -211,46 +211,36 @@ async fn user_provide_mismatched_embedding_dimension() { "startedAt": "[date]", "finishedAt": "[date]" } - "#); + "###); - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(response.uid()).await.succeeded(); - let (documents, _code) = index - .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) - .await; - snapshot!(json_string!(documents), @r###" + let task = index.wait_task(response.uid()).await; + snapshot!(task, @r###" { - "results": [ - { - "id": 0, - "name": "kefir", - "_vectors": { - "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 1.0 - ], - [ - 1.0, - 2.0, - 2.0 - ] - ], - "regenerate": false - } - } - } - ], - "offset": 0, - "limit": 20, - "total": 1 + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions in document with id `0` in `._vectors.manual`.\n - note: embedding #0 has dimensions 2\n - note: embedder `manual` requires 3", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" } "###); }