From 41203f0931006dcf96d895f71a7c3b51f2d289a3 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 11:37:47 +0100 Subject: [PATCH 1/8] Add embedders stats --- .../index-scheduler/src/index_mapper/mod.rs | 9 +++++ crates/meilisearch/src/routes/indexes/mod.rs | 10 ++++++ crates/milli/src/index.rs | 14 +++++++- crates/milli/src/vector/mod.rs | 35 +++++++++++++++++++ 4 files changed, 67 insertions(+), 1 deletion(-) diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index dad73d4c6..17d683bbb 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -106,6 +106,12 @@ pub struct IndexStats { /// are not returned to the disk after a deletion, this number is typically larger than /// `used_database_size` that only includes the size of the used pages. pub database_size: u64, + /// Number of embeddings in the index. + /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embeddings: Option, + /// Number of embedded documents in the index. + /// Option: retrocompatible with the stats of the pre-v1.13.0 versions of meilisearch + pub number_of_embedded_documents: Option, /// Size taken by the used pages of the index' DB, in bytes. /// /// As the DB backend does not return to the disk the pages that are not currently used by the DB, @@ -130,8 +136,11 @@ impl IndexStats { /// /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { + let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { number_of_documents: index.number_of_documents(rtxn)?, + number_of_embeddings: Some(arroy_stats.number_of_embeddings), + number_of_embedded_documents: Some(arroy_stats.documents.len()), database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index a03d5f691..7ca8e407f 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -496,6 +496,12 @@ pub struct IndexStats { pub number_of_documents: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, + /// Number of embeddings in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embeddings: Option, + /// Number of embedded documents in the index + #[serde(skip_serializing_if = "Option::is_none")] + pub number_of_embedded_documents: Option, /// Association of every field name with the number of times it occurs in the documents. #[schema(value_type = HashMap)] pub field_distribution: FieldDistribution, @@ -506,6 +512,8 @@ impl From for IndexStats { IndexStats { number_of_documents: stats.inner_stats.number_of_documents, is_indexing: stats.is_indexing, + number_of_embeddings: stats.inner_stats.number_of_embeddings, + number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, field_distribution: stats.inner_stats.field_distribution, } } @@ -524,6 +532,8 @@ impl From for IndexStats { (status = OK, description = "The stats of the index", body = IndexStats, content_type = "application/json", example = json!( { "numberOfDocuments": 10, + "numberOfEmbeddings": 10, + "numberOfEmbeddedDocuments": 10, "isIndexing": true, "fieldDistribution": { "genre": 10, diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 944fb6cd4..0550965ed 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec; use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec}; use crate::order_by_map::OrderByMap; use crate::proximity::ProximityPrecision; -use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig}; +use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig}; use crate::{ default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds, FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec, @@ -1731,6 +1731,18 @@ impl Index { let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default(); Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 }) } + + pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result { + let mut stats = ArroyStats::default(); + let embedding_configs = self.embedding_configs(rtxn)?; + for config in embedding_configs { + let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap(); + let reader = + ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized()); + reader.aggregate_stats(rtxn, &mut stats)?; + } + Ok(stats) + } } #[derive(Debug, Deserialize, Serialize)] diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index 9ccd7341c..a8ae4a1d8 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -410,8 +410,43 @@ impl ArroyWrapper { fn quantized_db(&self) -> arroy::Database { self.database.remap_data_type() } + + pub fn aggregate_stats( + &self, + rtxn: &RoTxn, + stats: &mut ArroyStats, + ) -> Result<(), arroy::Error> { + if self.quantized { + for reader in self.readers(rtxn, self.quantized_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len() as u64; + } + } else { + for reader in self.readers(rtxn, self.angular_db()) { + let reader = reader?; + let documents = reader.item_ids(); + if documents.is_empty() { + break; + } + stats.documents |= documents; + stats.number_of_embeddings += documents.len() as u64; + } + } + + Ok(()) + } } +#[derive(Debug, Default, Clone)] +pub struct ArroyStats { + pub number_of_embeddings: u64, + pub documents: RoaringBitmap, +} /// One or multiple embeddings stored consecutively in a flat vector. pub struct Embeddings { data: Vec, From bd27fe7d02f51da176f9cfdef9dd9588f0cb5b1a Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 11:45:02 +0100 Subject: [PATCH 2/8] force dumpless upgrade to recompute stats --- crates/milli/src/update/upgrade/mod.rs | 8 +++++-- crates/milli/src/update/upgrade/v1_12.rs | 7 +++--- crates/milli/src/update/upgrade/v1_13.rs | 29 ++++++++++++++++++++++++ 3 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 crates/milli/src/update/upgrade/v1_13.rs diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 5b7fda303..16f0eef7a 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -1,7 +1,9 @@ mod v1_12; +mod v1_13; use heed::RwTxn; -use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3}; +use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3}; +use v1_13::V1_13_0_To_Current; use crate::progress::{Progress, VariableNameStep}; use crate::{Index, InternalError, Result}; @@ -26,11 +28,13 @@ pub fn upgrade( progress: Progress, ) -> Result { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()]; + let upgrade_functions: &[&dyn UpgradeIndex] = + &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()]; let start = match from { (1, 12, 0..=2) => 0, (1, 12, 3..) => 1, + (1, 13, 0) => 2, // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 13, _) => return Ok(false), (major, minor, patch) => { diff --git a/crates/milli/src/update/upgrade/v1_12.rs b/crates/milli/src/update/upgrade/v1_12.rs index 9086e920f..c3228213c 100644 --- a/crates/milli/src/update/upgrade/v1_12.rs +++ b/crates/milli/src/update/upgrade/v1_12.rs @@ -32,9 +32,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 { } #[allow(non_camel_case_types)] -pub(super) struct V1_12_3_To_Current(); +pub(super) struct V1_12_3_To_V1_13_0 {} -impl UpgradeIndex for V1_12_3_To_Current { +impl UpgradeIndex for V1_12_3_To_V1_13_0 { fn upgrade( &self, _wtxn: &mut RwTxn, @@ -42,7 +42,8 @@ impl UpgradeIndex for V1_12_3_To_Current { _original: (u32, u32, u32), _progress: Progress, ) -> Result { - Ok(false) + // recompute the indexes stats + Ok(true) } fn target_version(&self) -> (u32, u32, u32) { diff --git a/crates/milli/src/update/upgrade/v1_13.rs b/crates/milli/src/update/upgrade/v1_13.rs new file mode 100644 index 000000000..52246a7f3 --- /dev/null +++ b/crates/milli/src/update/upgrade/v1_13.rs @@ -0,0 +1,29 @@ +use heed::RwTxn; + +use super::UpgradeIndex; +use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; +use crate::progress::Progress; +use crate::{Index, Result}; + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_0_To_Current(); + +impl UpgradeIndex for V1_13_0_To_Current { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + Ok(false) + } + + fn target_version(&self) -> (u32, u32, u32) { + ( + VERSION_MAJOR.parse().unwrap(), + VERSION_MINOR.parse().unwrap(), + VERSION_PATCH.parse().unwrap(), + ) + } +} From c7aeb554b281d2ca132486b92df7cd3a0af82c45 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 13:37:41 +0100 Subject: [PATCH 3/8] Add tests --- crates/meilisearch/tests/stats/mod.rs | 251 ++++++++++++++++++++++++++ 1 file changed, 251 insertions(+) diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index 1b4e458d3..70fc9d56a 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -1,3 +1,4 @@ +use meili_snap::{json_string, snapshot}; use time::format_description::well_known::Rfc3339; use time::OffsetDateTime; @@ -74,3 +75,253 @@ async fn stats() { assert_eq!(response["indexes"]["test"]["fieldDistribution"]["name"], 1); assert_eq!(response["indexes"]["test"]["fieldDistribution"]["age"], 1); } + +#[actix_rt::test] +async fn add_remove_embeddings() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 4 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [1, 1, 1] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 4, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 3 embeddings in total + let documents = json!([ + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 2 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": null, "handcrafted": [0, 0, 0] }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // 1 embedded documents for 2 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": null, "handcrafted": null }}, + ]); + + let (response, code) = index.update_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} + +#[actix_rt::test] +async fn add_remove_embedded_documents() { + let server = Server::new().await; + let index = server.index("doggo"); + + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + // 2 embedded documents for 4 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [1, 1, 1] }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 4, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // delete one embedded document, remaining 1 embedded documents for 2 embeddings in total + let (response, code) = index.delete_document(0).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 1, + "isIndexing": false, + "numberOfEmbeddings": 2, + "numberOfEmbeddedDocuments": 1, + "fieldDistribution": { + "id": 1, + "name": 1 + } + } + "###); +} + +#[actix_rt::test] +async fn update_embedder_settings() { + let server = Server::new().await; + let index = server.index("doggo"); + + // 2 embedded documents for 3 embeddings in total + // but no embedders are added in the settings yet so we expect 0 embedded documents for 0 embeddings in total + let documents = json!([ + {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": null }}, + ]); + + let (response, code) = index.add_documents(documents, None).await; + snapshot!(code, @"202 Accepted"); + index.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); + + // add embedders to the settings + // 2 embedded documents for 3 embeddings in total + let (response, code) = index + .update_settings(json!({ + "embedders": { + "manual": { + "source": "userProvided", + "dimensions": 3, + }, + "handcrafted": { + "source": "userProvided", + "dimensions": 3, + }, + + }, + })) + .await; + snapshot!(code, @"202 Accepted"); + server.wait_task(response.uid()).await.succeeded(); + + let (stats, _code) = index.stats().await; + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 2, + "isIndexing": false, + "numberOfEmbeddings": 3, + "numberOfEmbeddedDocuments": 2, + "fieldDistribution": { + "id": 2, + "name": 2 + } + } + "###); +} From 49e9655c24d007bab2de4460e5ca065a72cee508 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 14:05:32 +0100 Subject: [PATCH 4/8] Update snapshots --- crates/index-scheduler/src/scheduler/test.rs | 6 ++++-- crates/meilisearch/tests/documents/delete_documents.rs | 6 ++++++ crates/meilisearch/tests/dumps/mod.rs | 2 ++ crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs | 10 ++++++++-- 4 files changed, 20 insertions(+), 4 deletions(-) diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index a8ef88d56..44120ff64 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -903,7 +903,7 @@ fn create_and_list_index() { index_scheduler.index("kefir").unwrap(); let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap(); - snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#" + snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###" [ 1, [ @@ -912,6 +912,8 @@ fn create_and_list_index() { { "number_of_documents": 0, "database_size": "[bytes]", + "number_of_embeddings": 0, + "number_of_embedded_documents": 0, "used_database_size": "[bytes]", "primary_key": null, "field_distribution": {}, @@ -921,5 +923,5 @@ fn create_and_list_index() { ] ] ] - "#); + "###); } diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 918343f94..62cc51f29 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -161,6 +161,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 4, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 3, "id": 4 @@ -208,6 +210,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 2, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 2 @@ -274,6 +278,8 @@ async fn delete_document_by_filter() { { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "color": 1, "id": 1 diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index b438006c5..abede9566 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -173,6 +173,8 @@ async fn import_dump_v1_movie_with_settings() { { "numberOfDocuments": 53, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "genres": 53, "id": 53, diff --git a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs index 1d364d855..6aab2861a 100644 --- a/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs +++ b/crates/meilisearch/tests/upgrade/v1_12/v1_12_0.rs @@ -135,6 +135,8 @@ async fn check_the_index_scheduler(server: &Server) { "kefir": { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -215,6 +217,8 @@ async fn check_the_index_scheduler(server: &Server) { "kefir": { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -228,10 +232,12 @@ async fn check_the_index_scheduler(server: &Server) { "###); let index = server.index("kefir"); let (stats, _) = index.stats().await; - snapshot!(stats, @r#" + snapshot!(stats, @r###" { "numberOfDocuments": 1, "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, "fieldDistribution": { "age": 1, "description": 1, @@ -240,7 +246,7 @@ async fn check_the_index_scheduler(server: &Server) { "surname": 1 } } - "#); + "###); // Delete all the tasks of a specific batch let (task, _) = server.delete_tasks("batchUids=10").await; From a65c52cc97cb72604cce3ff45df058092870cb36 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Tue, 11 Feb 2025 15:16:10 +0100 Subject: [PATCH 5/8] Convert dump test into snapshots --- crates/meilisearch/tests/dumps/mod.rs | 304 +++++++++++++++++++++----- 1 file changed, 248 insertions(+), 56 deletions(-) diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index abede9566..1b07afdfd 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -27,9 +27,24 @@ async fn import_dump_v1_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -335,9 +350,24 @@ async fn import_dump_v1_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -485,9 +515,24 @@ async fn import_dump_v2_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -625,9 +670,24 @@ async fn import_dump_v2_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -775,9 +835,24 @@ async fn import_dump_v2_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -922,9 +997,24 @@ async fn import_dump_v3_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1062,9 +1152,24 @@ async fn import_dump_v3_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1212,9 +1317,24 @@ async fn import_dump_v3_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1359,9 +1479,24 @@ async fn import_dump_v4_movie_raw() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1499,9 +1634,24 @@ async fn import_dump_v4_movie_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"genres": 53, "id": 53, "overview": 53, "poster": 53, "release_date": 53, "title": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "genres": 53, + "id": 53, + "overview": 53, + "poster": 53, + "release_date": 53, + "title": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1649,9 +1799,24 @@ async fn import_dump_v4_rubygems_with_settings() { let (stats, code) = index.stats().await; snapshot!(code, @"200 OK"); - assert_eq!( - stats, - json!({ "numberOfDocuments": 53, "isIndexing": false, "fieldDistribution": {"description": 53, "id": 53, "name": 53, "summary": 53, "total_downloads": 53, "version": 53 }}) + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 53, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "description": 53, + "id": 53, + "name": 53, + "summary": 53, + "total_downloads": 53, + "version": 53 + } + } + "### ); let (settings, code) = index.settings().await; @@ -1800,33 +1965,35 @@ async fn import_dump_v5() { server.wait_task(task["uid"].as_u64().unwrap()).await; } - let expected_stats = json!({ - "numberOfDocuments": 10, - "isIndexing": false, - "fieldDistribution": { - "cast": 10, - "director": 10, - "genres": 10, - "id": 10, - "overview": 10, - "popularity": 10, - "poster_path": 10, - "producer": 10, - "production_companies": 10, - "release_date": 10, - "tagline": 10, - "title": 10, - "vote_average": 10, - "vote_count": 10 - } - }); - let index1 = server.index("test"); let index2 = server.index("test2"); let (stats, code) = index1.stats().await; snapshot!(code, @"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!(json_string!(stats), @r###" + { + "numberOfDocuments": 10, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (docs, code) = index2.get_all_documents(GetAllDocumentsOptions::default()).await; snapshot!(code, @"200 OK"); @@ -1837,7 +2004,32 @@ async fn import_dump_v5() { let (stats, code) = index2.stats().await; snapshot!(code, @"200 OK"); - assert_eq!(stats, expected_stats); + snapshot!( + json_string!(stats), + @r###" + { + "numberOfDocuments": 10, + "isIndexing": false, + "numberOfEmbeddings": 0, + "numberOfEmbeddedDocuments": 0, + "fieldDistribution": { + "cast": 10, + "director": 10, + "genres": 10, + "id": 10, + "overview": 10, + "popularity": 10, + "poster_path": 10, + "producer": 10, + "production_companies": 10, + "release_date": 10, + "tagline": 10, + "title": 10, + "vote_average": 10, + "vote_count": 10 + } + } + "###); let (keys, code) = server.list_api_keys("").await; snapshot!(code, @"200 OK"); From 8419ed52a12928445185d3cebefa7312964a24bd Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 14:38:51 +0100 Subject: [PATCH 6/8] fix clippy --- crates/milli/src/vector/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/milli/src/vector/mod.rs b/crates/milli/src/vector/mod.rs index a8ae4a1d8..74b52b1fe 100644 --- a/crates/milli/src/vector/mod.rs +++ b/crates/milli/src/vector/mod.rs @@ -424,7 +424,7 @@ impl ArroyWrapper { break; } stats.documents |= documents; - stats.number_of_embeddings += documents.len() as u64; + stats.number_of_embeddings += documents.len(); } } else { for reader in self.readers(rtxn, self.angular_db()) { @@ -434,7 +434,7 @@ impl ArroyWrapper { break; } stats.documents |= documents; - stats.number_of_embeddings += documents.len() as u64; + stats.number_of_embeddings += documents.len(); } } From 1caad4c4b02c2efc6136e48522387531c617317d Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 16:13:34 +0100 Subject: [PATCH 7/8] Add multiple embeddings for the same embedder in tests --- crates/meilisearch/tests/stats/mod.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/crates/meilisearch/tests/stats/mod.rs b/crates/meilisearch/tests/stats/mod.rs index 70fc9d56a..bb10d2cd5 100644 --- a/crates/meilisearch/tests/stats/mod.rs +++ b/crates/meilisearch/tests/stats/mod.rs @@ -99,10 +99,10 @@ async fn add_remove_embeddings() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await.succeeded(); - // 2 embedded documents for 4 embeddings in total + // 2 embedded documents for 5 embeddings in total let documents = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, - {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [1, 1, 1] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, ]); let (response, code) = index.add_documents(documents, None).await; @@ -114,7 +114,7 @@ async fn add_remove_embeddings() { { "numberOfDocuments": 2, "isIndexing": false, - "numberOfEmbeddings": 4, + "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, "fieldDistribution": { "id": 2, @@ -217,10 +217,10 @@ async fn add_remove_embedded_documents() { snapshot!(code, @"202 Accepted"); server.wait_task(response.uid()).await.succeeded(); - // 2 embedded documents for 4 embeddings in total + // 2 embedded documents for 5 embeddings in total let documents = json!([ {"id": 0, "name": "kefir", "_vectors": { "manual": [0, 0, 0], "handcrafted": [0, 0, 0] }}, - {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [1, 1, 1] }}, + {"id": 1, "name": "echo", "_vectors": { "manual": [1, 1, 1], "handcrafted": [[1, 1, 1], [2, 2, 2]] }}, ]); let (response, code) = index.add_documents(documents, None).await; @@ -232,7 +232,7 @@ async fn add_remove_embedded_documents() { { "numberOfDocuments": 2, "isIndexing": false, - "numberOfEmbeddings": 4, + "numberOfEmbeddings": 5, "numberOfEmbeddedDocuments": 2, "fieldDistribution": { "id": 2, @@ -241,7 +241,7 @@ async fn add_remove_embedded_documents() { } "###); - // delete one embedded document, remaining 1 embedded documents for 2 embeddings in total + // delete one embedded document, remaining 1 embedded documents for 3 embeddings in total let (response, code) = index.delete_document(0).await; snapshot!(code, @"202 Accepted"); index.wait_task(response.uid()).await.succeeded(); @@ -251,7 +251,7 @@ async fn add_remove_embedded_documents() { { "numberOfDocuments": 1, "isIndexing": false, - "numberOfEmbeddings": 2, + "numberOfEmbeddings": 3, "numberOfEmbeddedDocuments": 1, "fieldDistribution": { "id": 1, From c55fdad2c35316effb3944c3cf6e9f83aea16b9c Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Wed, 12 Feb 2025 16:23:10 +0100 Subject: [PATCH 8/8] Fix dumpless upgrade target version --- crates/milli/src/update/upgrade/v1_12.rs | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/crates/milli/src/update/upgrade/v1_12.rs b/crates/milli/src/update/upgrade/v1_12.rs index c3228213c..f46e7f745 100644 --- a/crates/milli/src/update/upgrade/v1_12.rs +++ b/crates/milli/src/update/upgrade/v1_12.rs @@ -1,7 +1,6 @@ use heed::RwTxn; use super::UpgradeIndex; -use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH}; use crate::progress::Progress; use crate::{make_enum_progress, Index, Result}; @@ -47,10 +46,6 @@ impl UpgradeIndex for V1_12_3_To_V1_13_0 { } fn target_version(&self) -> (u32, u32, u32) { - ( - VERSION_MAJOR.parse().unwrap(), - VERSION_MINOR.parse().unwrap(), - VERSION_PATCH.parse().unwrap(), - ) + (1, 13, 0) } }