diff --git a/crates/index-scheduler/src/index_mapper/mod.rs b/crates/index-scheduler/src/index_mapper/mod.rs index dad73d4c6..c01b05e5a 100644 --- a/crates/index-scheduler/src/index_mapper/mod.rs +++ b/crates/index-scheduler/src/index_mapper/mod.rs @@ -6,6 +6,7 @@ use std::{fs, thread}; use meilisearch_types::heed::types::{SerdeJson, Str}; use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn}; use meilisearch_types::milli; +use meilisearch_types::milli::database_stats::DatabaseStats; use meilisearch_types::milli::update::IndexerConfig; use meilisearch_types::milli::{FieldDistribution, Index}; use serde::{Deserialize, Serialize}; @@ -98,8 +99,9 @@ pub enum IndexStatus { /// The statistics that can be computed from an `Index` object. #[derive(Serialize, Deserialize, Debug)] pub struct IndexStats { - /// Number of documents in the index. - pub number_of_documents: u64, + /// Stats of the documents database. + #[serde(default)] + pub documents_database_stats: DatabaseStats, /// Size taken up by the index' DB, in bytes. /// /// This includes the size taken by both the used and free pages of the DB, and as the free pages @@ -131,7 +133,7 @@ impl IndexStats { /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`. 
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result { Ok(IndexStats { - number_of_documents: index.number_of_documents(rtxn)?, + documents_database_stats: index.documents_database_stats(rtxn)?, database_size: index.on_disk_size()?, used_database_size: index.used_size()?, primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()), diff --git a/crates/index-scheduler/src/insta_snapshot.rs b/crates/index-scheduler/src/insta_snapshot.rs index bb8827fdc..261b71e69 100644 --- a/crates/index-scheduler/src/insta_snapshot.rs +++ b/crates/index-scheduler/src/insta_snapshot.rs @@ -365,7 +365,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String { let stats = mapper.stats_of(rtxn, &name).unwrap(); s.push_str(&format!( "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n", - stats.number_of_documents, stats.field_distribution + stats.documents_database_stats.number_of_entries(), + stats.field_distribution )); } diff --git a/crates/index-scheduler/src/scheduler/test.rs b/crates/index-scheduler/src/scheduler/test.rs index a8ef88d56..3293e833e 100644 --- a/crates/index-scheduler/src/scheduler/test.rs +++ b/crates/index-scheduler/src/scheduler/test.rs @@ -903,14 +903,22 @@ fn create_and_list_index() { index_scheduler.index("kefir").unwrap(); let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap(); - snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#" + snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###" [ 1, [ [ "kefir", { - "number_of_documents": 0, + "documents_database_stats": { + "numberOfEntries": 0, + "totalKeySize": 0, + "totalValueSize": 0, + "maxKeySize": 0, + "maxValueSize": 0, + "minKeySize": 0, + 
"minValueSize": 0 + }, "database_size": "[bytes]", "used_database_size": "[bytes]", "primary_key": null, @@ -921,5 +929,5 @@ fn create_and_list_index() { ] ] ] - "#); + "###); } diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index a03d5f691..863e47339 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -494,6 +494,12 @@ pub async fn delete_index( pub struct IndexStats { /// Number of documents in the index pub number_of_documents: u64, + /// Size of the documents database, in bytes. + pub raw_document_db_size: u64, + /// Maximum size of a document in the documents database. + pub max_document_size: u64, + /// Average size of a document in the documents database. + pub avg_document_size: u64, /// Whether or not the index is currently ingesting document pub is_indexing: bool, /// Association of every field name with the number of times it occurs in the documents. @@ -504,7 +510,10 @@ pub struct IndexStats { impl From for IndexStats { fn from(stats: index_scheduler::IndexStats) -> Self { IndexStats { - number_of_documents: stats.inner_stats.number_of_documents, + number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + max_document_size: stats.inner_stats.documents_database_stats.max_value_size(), + avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, field_distribution: stats.inner_stats.field_distribution, } diff --git a/crates/meilisearch/tests/documents/delete_documents.rs b/crates/meilisearch/tests/documents/delete_documents.rs index 918343f94..b653c1a24 100644 --- a/crates/meilisearch/tests/documents/delete_documents.rs +++ b/crates/meilisearch/tests/documents/delete_documents.rs @@ -160,6 +160,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), 
@r###" { "numberOfDocuments": 4, + "rawDocumentDbSize": 42, + "maxDocumentSize": 13, + "avgDocumentSize": 10, "isIndexing": false, "fieldDistribution": { "color": 3, @@ -207,6 +210,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 2, + "rawDocumentDbSize": 16, + "maxDocumentSize": 12, + "avgDocumentSize": 8, "isIndexing": false, "fieldDistribution": { "color": 1, @@ -273,6 +279,9 @@ async fn delete_document_by_filter() { snapshot!(json_string!(stats), @r###" { "numberOfDocuments": 1, + "rawDocumentDbSize": 12, + "maxDocumentSize": 12, + "avgDocumentSize": 12, "isIndexing": false, "fieldDistribution": { "color": 1, diff --git a/crates/meilisearch/tests/dumps/mod.rs b/crates/meilisearch/tests/dumps/mod.rs index a2b008fe3..73e97958d 100644 --- a/crates/meilisearch/tests/dumps/mod.rs +++ b/crates/meilisearch/tests/dumps/mod.rs @@ -172,6 +172,9 @@ async fn import_dump_v1_movie_with_settings() { @r###" { "numberOfDocuments": 53, + "rawDocumentDbSize": 21965, + "maxDocumentSize": 743, + "avgDocumentSize": 414, "isIndexing": false, "fieldDistribution": { "genres": 53, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs new file mode 100644 index 000000000..a823bb26d --- /dev/null +++ b/crates/milli/src/database_stats.rs @@ -0,0 +1,100 @@ +use heed::types::Bytes; +use heed::Database; +use heed::RoTxn; +use serde::{Deserialize, Serialize}; + +use crate::Result; + +#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] +#[serde(rename_all = "camelCase")] +/// The stats of a database. +pub struct DatabaseStats { + /// The number of entries in the database. + number_of_entries: u64, + /// The total size of the keys in the database. + total_key_size: u64, + /// The total size of the values in the database. + total_value_size: u64, + /// The maximum size of a key in the database. + max_key_size: u64, + /// The maximum size of a value in the database. 
+    max_value_size: u64,
+    /// The minimum size of a key in the database.
+    min_key_size: u64,
+    /// The minimum size of a value in the database.
+    min_value_size: u64,
+}
+
+impl DatabaseStats {
+    /// Returns the stats of the database.
+    ///
+    /// This function iterates over the whole database and computes the stats.
+    /// It is not efficient and should be cached somewhere.
+    pub(crate) fn new<'a>(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'a>) -> Result<Self> {
+        // Minimums start at `u64::MAX` so that the first entry always lowers them.
+        let mut database_stats = Self {
+            min_key_size: u64::MAX,
+            min_value_size: u64::MAX,
+            ..Default::default()
+        };
+
+        let mut iter = database.iter(rtxn)?;
+        while let Some((key, value)) = iter.next().transpose()? {
+            let key_size = key.len() as u64;
+            let value_size = value.len() as u64;
+            database_stats.number_of_entries += 1;
+            database_stats.total_key_size += key_size;
+            database_stats.total_value_size += value_size;
+            database_stats.max_key_size = database_stats.max_key_size.max(key_size);
+            database_stats.max_value_size = database_stats.max_value_size.max(value_size);
+            database_stats.min_key_size = database_stats.min_key_size.min(key_size);
+            database_stats.min_value_size = database_stats.min_value_size.min(value_size);
+        }
+
+        // No entry was seen: report 0 rather than the `u64::MAX` sentinels.
+        if database_stats.number_of_entries == 0 {
+            database_stats.min_key_size = 0;
+            database_stats.min_value_size = 0;
+        }
+
+        Ok(database_stats)
+    }
+
+    /// Returns the average key size, rounded down, or 0 when the database is empty.
+    pub fn average_key_size(&self) -> u64 {
+        self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    /// Returns the average value size, rounded down, or 0 when the database is empty.
+    pub fn average_value_size(&self) -> u64 {
+        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    pub fn number_of_entries(&self) -> u64 {
+        self.number_of_entries
+    }
+
+    pub fn total_key_size(&self) -> u64 {
+        self.total_key_size
+    }
+
+    pub fn total_value_size(&self) -> u64 {
+        self.total_value_size
+    }
+
+    pub fn max_key_size(&self) -> u64 {
+        self.max_key_size
+    }
+
+    pub fn max_value_size(&self) -> u64 {
+        self.max_value_size
+    }
+
+    pub fn min_key_size(&self)
-> u64 { + self.min_key_size + } + + pub fn min_value_size(&self) -> u64 { + self.min_value_size + } +} diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index 944fb6cd4..2119206af 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -11,6 +11,7 @@ use rstar::RTree; use serde::{Deserialize, Serialize}; use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME}; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::error::{InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; @@ -403,6 +404,11 @@ impl Index { Ok(count.unwrap_or_default()) } + /// Returns the stats of the database. + pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result { + Ok(DatabaseStats::new(self.documents.remap_types::(), rtxn)?) + } + /* primary key */ /// Writes the documents primary key, this is the field name that is used to store the id. diff --git a/crates/milli/src/lib.rs b/crates/milli/src/lib.rs index ea88d2b78..12b5fbc2e 100644 --- a/crates/milli/src/lib.rs +++ b/crates/milli/src/lib.rs @@ -10,6 +10,7 @@ pub mod documents; mod asc_desc; mod criterion; +pub mod database_stats; mod error; mod external_documents_ids; pub mod facet; diff --git a/crates/milli/src/update/upgrade/mod.rs b/crates/milli/src/update/upgrade/mod.rs index 5b7fda303..9833faac5 100644 --- a/crates/milli/src/update/upgrade/mod.rs +++ b/crates/milli/src/update/upgrade/mod.rs @@ -1,7 +1,7 @@ mod v1_12; use heed::RwTxn; -use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3}; +use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3, V1_13_0_To_Current}; use crate::progress::{Progress, VariableNameStep}; use crate::{Index, InternalError, Result}; @@ -26,11 +26,13 @@ pub fn upgrade( progress: Progress, ) -> Result { let from = index.get_version(wtxn)?.unwrap_or(db_version); - let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()]; + let upgrade_functions: &[&dyn UpgradeIndex] = + &[&V1_12_To_V1_12_3 
{}, &V1_12_3_To_V1_13_0(), &V1_13_0_To_Current()]; let start = match from { (1, 12, 0..=2) => 0, (1, 12, 3..) => 1, + (1, 13, 0) => 2, // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other. (1, 13, _) => return Ok(false), (major, minor, patch) => { diff --git a/crates/milli/src/update/upgrade/v1_12.rs b/crates/milli/src/update/upgrade/v1_12.rs index e48ecfe36..c4f8578d0 100644 --- a/crates/milli/src/update/upgrade/v1_12.rs +++ b/crates/milli/src/update/upgrade/v1_12.rs @@ -33,9 +33,33 @@ impl UpgradeIndex for V1_12_To_V1_12_3 { } #[allow(non_camel_case_types)] -pub(super) struct V1_12_3_To_Current(); +pub(super) struct V1_12_3_To_V1_13_0(); -impl UpgradeIndex for V1_12_3_To_Current { +impl UpgradeIndex for V1_12_3_To_V1_13_0 { + fn upgrade( + &self, + _wtxn: &mut RwTxn, + _index: &Index, + _original: (u32, u32, u32), + _progress: Progress, + ) -> Result { + // recompute the indexes stats + Ok(true) + } + + fn target_version(&self) -> (u32, u32, u32) { + ( + VERSION_MAJOR.parse().unwrap(), + VERSION_MINOR.parse().unwrap(), + VERSION_PATCH.parse().unwrap(), + ) + } +} + +#[allow(non_camel_case_types)] +pub(super) struct V1_13_0_To_Current(); + +impl UpgradeIndex for V1_13_0_To_Current { fn upgrade( &self, _wtxn: &mut RwTxn,