From 9a6c1730aa265a255b6d0c2df9da5a613e8d8b65 Mon Sep 17 00:00:00 2001 From: ManyTheFish Date: Mon, 10 Feb 2025 14:10:13 +0100 Subject: [PATCH] Add document database stats --- .../index-scheduler/src/index_mapper/mod.rs | 8 +- crates/index-scheduler/src/insta_snapshot.rs | 3 +- crates/index-scheduler/src/scheduler/test.rs | 10 +- crates/meilisearch/src/routes/indexes/mod.rs | 11 +- .../tests/documents/delete_documents.rs | 9 ++ crates/meilisearch/tests/dumps/mod.rs | 3 + crates/milli/src/database_stats.rs | 100 ++++++++++++++++++ crates/milli/src/index.rs | 6 ++ crates/milli/src/lib.rs | 1 + 9 files changed, 145 insertions(+), 6 deletions(-) create mode 100644 crates/milli/src/database_stats.rs diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index 17d683bbb..7b226ac01 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -6,6 +6,7 @@ use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli; +use meilisearch_types::milli::database_stats::DatabaseStats; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -98,8 +99,9 @@ pub enum IndexStatus { /// The statistics that can be computed from an `Index` object. #[derive(Serialize, Deserialize, Debug)] pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, + /// Stats of the documents database. + #[serde(default)] + pub documents_database_stats: DatabaseStats, /// Size taken up by the index' DB, in bytes. 
/// /// This includes the size taken by both the used and free pages of the DB, and as the free pages @@ -138,9 +140,9 @@ impl IndexStats { pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { let arroy_stats = index.arroy_stats(rtxn)?; Ok(IndexStats { - number_of_documents: index.number_of_documents(rtxn)?, number_of_embeddings: Some(arroy_stats.number_of_embeddings), number_of_embedded_documents: Some(arroy_stats.documents.len()), + documents_database_stats: index.documents_database_stats(rtxn)?, database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index 6f1863876..bcc295afd 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -370,7 +370,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { let stats = mapper.stats_of(rtxn, &name).unwrap(); s.push_str(&format!( "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", - stats.number_of_documents, stats.field_distribution + stats.documents_database_stats.number_of_entries(), + stats.field_distribution )); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index 44120ff64..ddce7b2e0 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -910,7 +910,15 @@ fn create_and_list_index() { [ "kefir", { - "number_of_documents": 0, + "documents_database_stats": { + "numberOfEntries": 0, + "totalKeySize": 0, + "totalValueSize": 0, + "maxKeySize": 0, + "maxValueSize": 0, + "minKeySize": 0, + "minValueSize": 0 + }, "database_size": "[bytes]", "number_of_embeddings": 0, "number_of_embedded_documents": 0, diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 
7ca8e407f..6ccdb8e71 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -494,6 +494,12 @@ pub async fn delete_index( pub struct IndexStats { /// Number of documents in the index pub number_of_documents: u64, + /// Size of the documents database, in bytes. + pub raw_document_db_size: u64, + /// Maximum size of a document in the documents database. + pub max_document_size: u64, + /// Average size of a document in the documents database. + pub avg_document_size: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, /// Number of embeddings in the index @@ -510,7 +516,10 @@ pub struct IndexStats { impl From for IndexStats { fn from(stats: index_scheduler::IndexStats) -> Self { IndexStats { - number_of_documents: stats.inner_stats.number_of_documents, + number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + max_document_size: stats.inner_stats.documents_database_stats.max_value_size(), + avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents, diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 62cc51f29..34a2c8325 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -160,6 +160,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 4, + "rawDocumentDbSize": 42, + "maxDocumentSize": 13, + "avgDocumentSize": 10, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -209,6 +212,9 @@ async fn delete_document_by_filter() { 
snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 16, + "maxDocumentSize": 12, + "avgDocumentSize": 8, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, @@ -277,6 +283,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 12, + "maxDocumentSize": 12, + "avgDocumentSize": 12, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index 55ee9dc93..21588ea90 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -187,6 +187,9 @@ async fn import_dump_v1_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "maxDocumentSize": 743, + "avgDocumentSize": 414, "isIndexing": false, "numberOfEmbeddings": 0, "numberOfEmbeddedDocuments": 0, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs new file mode 100644 index 000000000..a823bb26d --- /dev/null +++ b/crates/milli/src/database_stats.rs @@ -0,0 +1,100 @@ +use heed::types::Bytes; +use heed::Database; +use heed::RoTxn; +use serde::{Deserialize, Serialize}; + +use crate::Result; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// The stats of a database. +pub struct DatabaseStats { + /// The number of entries in the database. + number_of_entries: u64, + /// The total size of the keys in the database. + total_key_size: u64, + /// The total size of the values in the database. + total_value_size: u64, + /// The maximum size of a key in the database. + max_key_size: u64, + /// The maximum size of a value in the database. + max_value_size: u64, + /// The minimum size of a key in the database. + min_key_size: u64, + /// The minimum size of a value in the database. 
+    min_value_size: u64,
+}
+
+impl DatabaseStats {
+    /// Returns the stats of the database.
+    ///
+    /// This function iterates over the whole database and computes the stats.
+    /// It is not efficient and should be cached somewhere.
+    pub(crate) fn new<'a>(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'a>) -> Result<Self> {
+        // Minimums start at `u64::MAX` so that the first entry always lowers them.
+        let mut database_stats =
+            Self { min_key_size: u64::MAX, min_value_size: u64::MAX, ..Self::default() };
+
+        for entry in database.iter(rtxn)? {
+            let (key, value) = entry?;
+            // Sizes are byte lengths: keys and values are read as raw `Bytes`.
+            let key_size = key.len() as u64;
+            let value_size = value.len() as u64;
+            database_stats.number_of_entries += 1;
+            database_stats.total_key_size += key_size;
+            database_stats.total_value_size += value_size;
+            database_stats.max_key_size = database_stats.max_key_size.max(key_size);
+            database_stats.max_value_size = database_stats.max_value_size.max(value_size);
+            database_stats.min_key_size = database_stats.min_key_size.min(key_size);
+            database_stats.min_value_size = database_stats.min_value_size.min(value_size);
+        }
+
+        // An empty database has no minimum: report 0 instead of the `u64::MAX` sentinel.
+        if database_stats.number_of_entries == 0 {
+            database_stats.min_key_size = 0;
+            database_stats.min_value_size = 0;
+        }
+
+        Ok(database_stats)
+    }
+
+    /// Returns the average key size in bytes, rounded down.
+    /// An empty database yields 0 instead of panicking on a division by zero.
+    pub fn average_key_size(&self) -> u64 {
+        self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    /// Returns the average value size in bytes, rounded down.
+    /// An empty database yields 0 instead of panicking on a division by zero.
+    pub fn average_value_size(&self) -> u64 {
+        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    pub fn number_of_entries(&self) -> u64 {
+        self.number_of_entries
+    }
+
+    pub fn total_key_size(&self) -> u64 {
+        self.total_key_size
+    }
+
+    pub fn total_value_size(&self) -> u64 {
+        self.total_value_size
+    }
+
+    pub fn max_key_size(&self) -> u64 {
+        self.max_key_size
+    }
+
+    pub fn max_value_size(&self) -> u64 {
+        self.max_value_size
+    }
+
+    pub fn min_key_size(&self) -> u64 {
+        self.min_key_size
+    }
+
+    pub fn min_value_size(&self) -> u64 {
+        self.min_value_size
+    }
+}
diff --git a/crates/milli/src/index.rs
b/crates/milli/src/index.rs index df1baed3c..f59d31321 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -11,6 +11,7 @@ use rstar::RTree; use serde::{Deserialize, Serialize}; use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -403,6 +404,11 @@ impl Index { Ok(count.unwrap_or_default()) } + /// Returns the stats of the database. + pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(DatabaseStats::new(self.documents.remap_types::(), rtxn)?) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index bb1532c1a..1d6d04fc7 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -10,6 +10,7 @@ pub mod documents; mod asc_desc; mod criterion; +pub mod database_stats; mod error; mod external_documents_ids; pub mod facet;