From e019ad7692f64be9ae3e9fe88197d544a83a5bcb Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Fri, 21 Mar 2025 15:41:31 +0100 Subject: [PATCH 01/10] Display more detailed error message instead of panic --- crates/milli/src/update/new/indexer/write.rs | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 8618b4b21..2df19bcd6 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -13,7 +13,7 @@ use crate::index::IndexEmbeddingConfig; use crate::progress::Progress; use crate::update::settings::InnerIndexSettings; use crate::vector::{ArroyWrapper, Embedder, EmbeddingConfigs, Embeddings}; -use crate::{Error, Index, InternalError, Result}; +use crate::{Error, Index, InternalError, Result, UserError}; pub fn write_to_db( mut writer_receiver: WriterBbqueueReceiver<'_>, @@ -218,6 +218,12 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); + if all_embeddings.len() != *dimensions { + return Err(Error::UserError(UserError::InvalidVectorDimensions { + expected: *dimensions, + found: all_embeddings.len(), + })); + } embeddings.append(all_embeddings.to_vec()).unwrap(); writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; From 868c90293589250df8e468d2185930fbc1dbccad Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Mon, 24 Mar 2025 00:24:50 +0100 Subject: [PATCH 02/10] fix meilisearch integration vector tests --- crates/meilisearch/tests/vector/mod.rs | 46 +++++++++++++------ .../document-deleted.snap | 11 +++-- 2 files changed, 40 insertions(+), 17 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 67da51702..75b00127f 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -95,12 +95,12 @@ async fn add_remove_user_provided() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.succeeded(); + index.wait_task(value.uid()).await.failed(); let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -110,9 +110,9 @@ async fn add_remove_user_provided() { "manual": { "embeddings": [ [ - 10.0, - 10.0, - 10.0 + 0.0, + 0.0, + 0.0 ] ], "regenerate": false @@ -124,7 +124,13 @@ async fn add_remove_user_provided() { "name": "echo", "_vectors": { "manual": { - "embeddings": [], + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], "regenerate": false } } @@ -134,7 +140,7 @@ async fn add_remove_user_provided() { "limit": 20, "total": 2 } - "###); + "#); let (value, code) = index.delete_document(0).await; snapshot!(code, @"202 Accepted"); @@ -143,7 +149,7 @@ async fn add_remove_user_provided() { let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -151,7 +157,13 @@ async fn add_remove_user_provided() { "name": "echo", "_vectors": { "manual": { - "embeddings": [], + "embeddings": [ + [ + 1.0, + 1.0, + 1.0 + ] + ], "regenerate": false } } @@ -161,7 +173,7 @@ async fn add_remove_user_provided() { "limit": 20, "total": 1 } - "###); + "#); } async fn generate_default_user_provided_documents(server: &Server) -> Index { @@ -189,7 +201,7 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.succeeded(); + index.wait_task(value.uid()).await.failed(); index } @@ -678,7 +690,7 @@ async fn add_remove_one_vector_4588() { let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) .await; - snapshot!(json_string!(documents), @r###" + snapshot!(json_string!(documents), @r#" { "results": [ { @@ -686,7 +698,13 @@ async fn add_remove_one_vector_4588() { "name": "kefir", "_vectors": { "manual": { - "embeddings": [], + "embeddings": [ + [ + 0.0, + 0.0, + 0.0 + ] + ], "regenerate": false } } @@ -696,5 +714,5 @@ async fn add_remove_one_vector_4588() { "limit": 20, "total": 1 } - "###); + "#); } diff --git a/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap b/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap index 709dfeae0..cbc8ead8f 100644 --- a/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap +++ b/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap @@ -5,14 +5,19 @@ source: crates/meilisearch/tests/vector/mod.rs "uid": "[uid]", "batchUid": "[batch_uid]", "indexUid": "doggo", - "status": "succeeded", + "status": "failed", "type": "documentAdditionOrUpdate", "canceledBy": null, "details": { "receivedDocuments": 1, - "indexedDocuments": 1 + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `0`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" }, - "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", From d71c6f3483a2bbd26144dc980dbc36f45a21feba Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:04:25 +0100 Subject: [PATCH 03/10] allow multiple embedding in per document per embedder to pass --- crates/milli/src/update/new/indexer/write.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 2df19bcd6..6f7e212b1 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -218,7 +218,8 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - if all_embeddings.len() != *dimensions { + // FIXME: /!\ Case where #embeddings is divisor of `dimensions` would still pass + if all_embeddings.len() % *dimensions != 0 { return Err(Error::UserError(UserError::InvalidVectorDimensions { expected: *dimensions, found: all_embeddings.len(), From 6b1c262b7456ec864836028428a2f3db57fcaa3c Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:43:15 +0100 Subject: [PATCH 04/10] fix all tests --- crates/meilisearch/tests/vector/mod.rs | 2 +- crates/milli/src/update/new/indexer/write.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 75b00127f..2ac1bbdac 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -201,7 +201,7 @@ async fn generate_default_user_provided_documents(server: &Server) -> Index { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.failed(); + index.wait_task(value.uid()).await.succeeded(); index } diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 6f7e212b1..77cb84ab6 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -219,7 +219,7 @@ pub fn write_from_bbqueue( let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); // FIXME: /!\ Case where #embeddings is divisor of `dimensions` would still pass - if all_embeddings.len() % *dimensions != 0 { + if *dimensions!= 0 && all_embeddings.len() % *dimensions != 0 { return Err(Error::UserError(UserError::InvalidVectorDimensions { expected: *dimensions, found: all_embeddings.len(), From 38b3e03dde0ad241781bcf076971661df11ccf6b Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:51:36 +0100 Subject: [PATCH 05/10] add embedding with dimension mismatch test case --- crates/meilisearch/tests/vector/mod.rs | 50 ++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 2ac1bbdac..a86acd307 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -176,6 +176,56 @@ async fn add_remove_user_provided() { "#); } +#[actix_rt::test] +async fn user_provide_mismatched_embedding_dimension() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + } + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0] }}, + ]); + let (value, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + let task = index.wait_task(value.uid()).await; + snapshot!(task, @r#" + { + "uid": "[uid]", + "batchUid": "[batch_uid]", + "indexUid": "doggo", + "status": "failed", + "type": "documentAdditionOrUpdate", + "canceledBy": null, + "details": { + "receivedDocuments": 1, + "indexedDocuments": 0 + }, + "error": { + "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `2`.", + "code": "invalid_vector_dimensions", + "type": "invalid_request", + "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + }, + "duration": "[duration]", + "enqueuedAt": "[date]", + "startedAt": "[date]", + "finishedAt": "[date]" + } + "#); +} + async fn generate_default_user_provided_documents(server: &Server) -> Index { let index = server.index("doggo"); From 18bc56f1fa9a8149af36a0ce1e9c51fe386d379a Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 12:54:49 +0100 Subject: [PATCH 06/10] update cargo insta --- .../add_remove_one_vector_4588/document-deleted.snap | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap b/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap index cbc8ead8f..709dfeae0 100644 --- a/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap +++ b/crates/meilisearch/tests/vector/snapshots/mod.rs/add_remove_one_vector_4588/document-deleted.snap @@ -5,19 +5,14 @@ source: crates/meilisearch/tests/vector/mod.rs "uid": "[uid]", "batchUid": "[batch_uid]", "indexUid": "doggo", - "status": "failed", + "status": "succeeded", "type": "documentAdditionOrUpdate", "canceledBy": null, "details": { "receivedDocuments": 1, - "indexedDocuments": 0 - }, - "error": { - "message": "Index `doggo`: Invalid vector dimensions: expected: `3`, found: `0`.", - "code": "invalid_vector_dimensions", - "type": "invalid_request", - "link": "https://docs.meilisearch.com/errors#invalid_vector_dimensions" + "indexedDocuments": 1 }, + "error": null, "duration": "[duration]", "enqueuedAt": "[date]", "startedAt": "[date]", From a8c407fa36bead1a3c6156967ca0dc82dbaa5f25 Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 13:06:11 +0100 Subject: [PATCH 07/10] fix failling tests --- crates/meilisearch/tests/vector/mod.rs | 32 ++++++-------------------- 1 file changed, 7 insertions(+), 25 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index a86acd307..5b718bf7d 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -95,7 +95,7 @@ async fn add_remove_user_provided() { ]); let (value, code) = index.add_documents(documents, None).await; snapshot!(code, @"202 Accepted"); - index.wait_task(value.uid()).await.failed(); + index.wait_task(value.uid()).await.succeeded(); let (documents, _code) = index .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) @@ -110,9 +110,9 @@ async fn add_remove_user_provided() { "manual": { "embeddings": [ [ - 0.0, - 0.0, - 0.0 + 10.0, + 10.0, + 10.0 ] ], "regenerate": false @@ -124,13 +124,7 @@ async fn add_remove_user_provided() { "name": "echo", "_vectors": { "manual": { - "embeddings": [ - [ - 1.0, - 1.0, - 1.0 - ] - ], + "embeddings": [], "regenerate": false } } @@ -157,13 +151,7 @@ async fn add_remove_user_provided() { "name": "echo", "_vectors": { "manual": { - "embeddings": [ - [ - 1.0, - 1.0, - 1.0 - ] - ], + "embeddings": [], "regenerate": false } } @@ -748,13 +736,7 @@ async fn add_remove_one_vector_4588() { "name": "kefir", "_vectors": { "manual": { - "embeddings": [ - [ - 0.0, - 0.0, - 0.0 - ] - ], + "embeddings": [], "regenerate": false } } From 43c8a206b43c61712e4a5a7861b82eecc2fc6ddb Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Tue, 25 Mar 2025 13:07:17 +0100 Subject: [PATCH 08/10] detail comments --- crates/milli/src/update/new/indexer/write.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 77cb84ab6..be8788ab0 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -218,7 +218,7 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - // FIXME: /!\ Case where #embeddings is divisor of `dimensions` would still pass + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass if *dimensions!= 0 && all_embeddings.len() % *dimensions != 0 { return Err(Error::UserError(UserError::InvalidVectorDimensions { expected: *dimensions, From bf3a29b60d382b1a264a62f74f123ae487b4dcdb Mon Sep 17 00:00:00 2001 From: vuthanhtung2412 Date: Wed, 26 Mar 2025 12:57:25 +0100 Subject: [PATCH 09/10] Document problematic case in test and acknowledge PR comment --- crates/meilisearch/tests/vector/mod.rs | 8 ++++++++ crates/milli/src/update/new/indexer/write.rs | 4 +--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 5b718bf7d..32b77aa98 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -212,6 +212,14 @@ async fn user_provide_mismatched_embedding_dimension() { "finishedAt": "[date]" } "#); + + // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass + let new_document = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, + ]); + let (value, code) = index.add_documents(new_document, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); } async fn generate_default_user_provided_documents(server: &Server) -> Index { diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index be8788ab0..ca860bbff 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -218,14 +218,12 @@ pub fn write_from_bbqueue( arroy_writers.get(&embedder_id).expect("requested a missing embedder"); let mut embeddings = Embeddings::new(*dimensions); let all_embeddings = asvs.read_all_embeddings_into_vec(frame, aligned_embedding); - // FIXME: /!\ Case where number of embeddings is divisor of `dimensions` would still pass - if *dimensions!= 0 && all_embeddings.len() % *dimensions != 0 { + if embeddings.append(all_embeddings.to_vec()).is_err() { return Err(Error::UserError(UserError::InvalidVectorDimensions { expected: *dimensions, found: all_embeddings.len(), })); } - embeddings.append(all_embeddings.to_vec()).unwrap(); writer.del_items(wtxn, *dimensions, docid)?; writer.add_items(wtxn, docid, &embeddings)?; } From a8afd5dbcb7aca82005d0928c262445635bff508 Mon Sep 17 00:00:00 2001 From: Tamo Date: Thu, 27 Mar 2025 11:07:01 +0100 Subject: [PATCH 10/10] fix warn and show what meilisearch understood of the vectors in the cursed test --- crates/meilisearch/tests/vector/mod.rs | 35 +++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/crates/meilisearch/tests/vector/mod.rs b/crates/meilisearch/tests/vector/mod.rs index 32b77aa98..2a7038fcb 100644 --- a/crates/meilisearch/tests/vector/mod.rs +++ b/crates/meilisearch/tests/vector/mod.rs @@ -217,9 +217,42 @@ async fn user_provide_mismatched_embedding_dimension() { let new_document = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [[0, 0], [1, 1], [2, 2]] }}, ]); - let (value, code) = index.add_documents(new_document, None).await; + let (response, code) = index.add_documents(new_document, None).await; snapshot!(code, @"202 Accepted"); index.wait_task(response.uid()).await.succeeded(); + let (documents, _code) = index + .get_all_documents(GetAllDocumentsOptions { retrieve_vectors: true, ..Default::default() }) + .await; + snapshot!(json_string!(documents), @r###" + { + "results": [ + { + "id": 0, + "name": "kefir", + "_vectors": { + "manual": { + "embeddings": [ + [ + 0.0, + 0.0, + 1.0 + ], + [ + 1.0, + 2.0, + 2.0 + ] + ], + "regenerate": false + } + } + } + ], + "offset": 0, + "limit": 20, + "total": 1 + } + "###); } async fn generate_default_user_provided_documents(server: &Server) -> Index {