Add document database stats

ManyTheFish 2025-02-10 14:10:13 +01:00
parent 4abf0db0b4
commit b5dc971afe
11 changed files with 177 additions and 12 deletions

View File

@@ -6,6 +6,7 @@ use std::{fs, thread};
 use meilisearch_types::heed::types::{SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
 use meilisearch_types::milli;
+use meilisearch_types::milli::database_stats::DatabaseStats;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{FieldDistribution, Index};
 use serde::{Deserialize, Serialize};
@@ -98,8 +99,9 @@ pub enum IndexStatus {
 /// The statistics that can be computed from an `Index` object.
 #[derive(Serialize, Deserialize, Debug)]
 pub struct IndexStats {
-    /// Number of documents in the index.
-    pub number_of_documents: u64,
+    /// Stats of the documents database.
+    #[serde(default)]
+    pub documents_database_stats: DatabaseStats,
     /// Size taken up by the index' DB, in bytes.
     ///
     /// This includes the size taken by both the used and free pages of the DB, and as the free pages
@@ -131,7 +133,7 @@ impl IndexStats {
     /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
         Ok(IndexStats {
-            number_of_documents: index.number_of_documents(rtxn)?,
+            documents_database_stats: index.documents_database_stats(rtxn)?,
             database_size: index.on_disk_size()?,
             used_database_size: index.used_size()?,
             primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
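Note: the `#[serde(default)]` attribute is what keeps previously persisted stats readable. A payload written before this change has no `documents_database_stats` field, so deserialization falls back to `DatabaseStats::default()` instead of failing. A minimal standalone sketch of that behavior (types reduced to two fields, using `serde`/`serde_json`; these are not the real Meilisearch definitions):

```rust
use serde::{Deserialize, Serialize};

// Simplified stand-in for milli's `DatabaseStats`; only two fields kept.
#[derive(Serialize, Deserialize, Debug, Default, PartialEq)]
#[serde(rename_all = "camelCase")]
struct DatabaseStats {
    number_of_entries: u64,
    total_value_size: u64,
}

#[derive(Serialize, Deserialize, Debug)]
struct IndexStats {
    // Absent in payloads persisted by older versions, so fall back to default.
    #[serde(default)]
    documents_database_stats: DatabaseStats,
    database_size: u64,
}

fn main() {
    // A stats payload written before this commit: no `documents_database_stats`.
    let old_payload = r#"{ "database_size": 4096 }"#;
    let stats: IndexStats = serde_json::from_str(old_payload).unwrap();
    assert_eq!(stats.documents_database_stats, DatabaseStats::default());
}
```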

View File

@@ -365,7 +365,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String {
         let stats = mapper.stats_of(rtxn, &name).unwrap();
         s.push_str(&format!(
             "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n",
-            stats.number_of_documents, stats.field_distribution
+            stats.documents_database_stats.number_of_entries(),
+            stats.field_distribution
         ));
     }

View File

@@ -903,14 +903,22 @@ fn create_and_list_index() {
     index_scheduler.index("kefir").unwrap();
     let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap();
-    snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#"
+    snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###"
     [
       1,
       [
         [
           "kefir",
           {
-            "number_of_documents": 0,
+            "documents_database_stats": {
+              "numberOfEntries": 0,
+              "totalKeySize": 0,
+              "totalValueSize": 0,
+              "maxKeySize": 0,
+              "maxValueSize": 0,
+              "minKeySize": 0,
+              "minValueSize": 0
+            },
             "database_size": "[bytes]",
             "used_database_size": "[bytes]",
             "primary_key": null,
@@ -921,5 +929,5 @@ fn create_and_list_index() {
         ]
       ]
     ]
-    "#);
+    "###);
 }

View File

@@ -494,6 +494,12 @@ pub async fn delete_index(
 pub struct IndexStats {
     /// Number of documents in the index
     pub number_of_documents: u64,
+    /// Size of the documents database, in bytes.
+    pub raw_document_db_size: u64,
+    /// Maximum size of a document in the documents database.
+    pub max_document_size: u64,
+    /// Average size of a document in the documents database.
+    pub avg_document_size: u64,
     /// Whether or not the index is currently ingesting document
     pub is_indexing: bool,
     /// Association of every field name with the number of times it occurs in the documents.
@@ -504,7 +510,10 @@ pub struct IndexStats {
 impl From<index_scheduler::IndexStats> for IndexStats {
     fn from(stats: index_scheduler::IndexStats) -> Self {
         IndexStats {
-            number_of_documents: stats.inner_stats.number_of_documents,
+            number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(),
+            raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
+            max_document_size: stats.inner_stats.documents_database_stats.max_value_size(),
+            avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
             is_indexing: stats.is_indexing,
             field_distribution: stats.inner_stats.field_distribution,
         }
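One consequence of deriving `avgDocumentSize` this way is integer division: in the test snapshots further down, 4 documents totalling 42 bytes report an average of 10, not 10.5. A hypothetical helper mirroring that guarded arithmetic (the free-function form is an illustration, not the actual milli API):

```rust
// Mirrors `DatabaseStats::average_value_size`: truncating division, with a
// guard so an empty documents database reports 0 instead of panicking.
fn average_value_size(total_value_size: u64, number_of_entries: u64) -> u64 {
    total_value_size.checked_div(number_of_entries).unwrap_or(0)
}

fn main() {
    // Matches the `delete_document_by_filter` snapshot: 42 bytes over 4 documents.
    assert_eq!(average_value_size(42, 4), 10); // truncates, does not round
    assert_eq!(average_value_size(0, 0), 0);   // empty-database guard
}
```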

View File

@@ -160,6 +160,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 4,
+      "rawDocumentDbSize": 42,
+      "maxDocumentSize": 13,
+      "avgDocumentSize": 10,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 3,
@@ -207,6 +210,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 2,
+      "rawDocumentDbSize": 16,
+      "maxDocumentSize": 12,
+      "avgDocumentSize": 8,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 1,
@@ -273,6 +279,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 1,
+      "rawDocumentDbSize": 12,
+      "maxDocumentSize": 12,
+      "avgDocumentSize": 12,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 1,

View File

@@ -172,6 +172,9 @@ async fn import_dump_v1_movie_with_settings() {
     @r###"
     {
       "numberOfDocuments": 53,
+      "rawDocumentDbSize": 21965,
+      "maxDocumentSize": 743,
+      "avgDocumentSize": 414,
       "isIndexing": false,
       "fieldDistribution": {
         "genres": 53,

View File

@@ -0,0 +1,100 @@
+use heed::types::Bytes;
+use heed::Database;
+use heed::RoTxn;
+use serde::{Deserialize, Serialize};
+
+use crate::Result;
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+/// The stats of a database.
+pub struct DatabaseStats {
+    /// The number of entries in the database.
+    number_of_entries: u64,
+    /// The total size of the keys in the database.
+    total_key_size: u64,
+    /// The total size of the values in the database.
+    total_value_size: u64,
+    /// The maximum size of a key in the database.
+    max_key_size: u64,
+    /// The maximum size of a value in the database.
+    max_value_size: u64,
+    /// The minimum size of a key in the database.
+    min_key_size: u64,
+    /// The minimum size of a value in the database.
+    min_value_size: u64,
+}
+
+impl DatabaseStats {
+    /// Returns the stats of the database.
+    ///
+    /// This function iterates over the whole database and computes the stats.
+    /// It is not efficient and should be cached somewhere.
+    pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> Result<Self> {
+        let mut database_stats = Self {
+            number_of_entries: 0,
+            total_key_size: 0,
+            total_value_size: 0,
+            max_key_size: 0,
+            max_value_size: 0,
+            min_key_size: u64::MAX,
+            min_value_size: u64::MAX,
+        };
+
+        let mut iter = database.iter(rtxn)?;
+        while let Some((key, value)) = iter.next().transpose()? {
+            let key_size = key.len() as u64;
+            let value_size = value.len() as u64;
+            database_stats.number_of_entries += 1;
+            database_stats.total_key_size += key_size;
+            database_stats.total_value_size += value_size;
+            database_stats.max_key_size = database_stats.max_key_size.max(key_size);
+            database_stats.max_value_size = database_stats.max_value_size.max(value_size);
+            database_stats.min_key_size = database_stats.min_key_size.min(key_size);
+            database_stats.min_value_size = database_stats.min_value_size.min(value_size);
+        }
+
+        // An empty database has no entries to take a minimum over.
+        if database_stats.number_of_entries == 0 {
+            database_stats.min_key_size = 0;
+            database_stats.min_value_size = 0;
+        }
+
+        Ok(database_stats)
+    }
+
+    /// Returns the average key size, or 0 when the database is empty.
+    pub fn average_key_size(&self) -> u64 {
+        self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    /// Returns the average value size, or 0 when the database is empty.
+    pub fn average_value_size(&self) -> u64 {
+        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    pub fn number_of_entries(&self) -> u64 {
+        self.number_of_entries
+    }
+
+    pub fn total_key_size(&self) -> u64 {
+        self.total_key_size
+    }
+
+    pub fn total_value_size(&self) -> u64 {
+        self.total_value_size
+    }
+
+    pub fn max_key_size(&self) -> u64 {
+        self.max_key_size
+    }
+
+    pub fn max_value_size(&self) -> u64 {
+        self.max_value_size
+    }
+
+    pub fn min_key_size(&self) -> u64 {
+        self.min_key_size
+    }
+
+    pub fn min_value_size(&self) -> u64 {
+        self.min_value_size
+    }
+}
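A minimal sketch of how `DatabaseStats::new` could be exercised from inside milli (the constructor is `pub(crate)`, so this has to live in the crate's own tests). The `tempfile` dev-dependency and the map size are assumptions:

```rust
#[cfg(test)]
mod tests {
    use heed::types::Bytes;
    use heed::EnvOpenOptions;

    use super::DatabaseStats;

    #[test]
    fn stats_of_a_small_database() -> Result<(), Box<dyn std::error::Error>> {
        let dir = tempfile::tempdir()?;
        // `open` is unsafe in recent heed versions: the caller must guarantee
        // the same environment is not opened twice by the process.
        let env = unsafe { EnvOpenOptions::new().map_size(10 * 1024 * 1024).open(dir.path())? };

        let mut wtxn = env.write_txn()?;
        let db = env.create_database::<Bytes, Bytes>(&mut wtxn, None)?;
        db.put(&mut wtxn, b"a", b"hello")?;
        db.put(&mut wtxn, b"bb", b"world!!")?;
        wtxn.commit()?;

        let rtxn = env.read_txn()?;
        let stats = DatabaseStats::new(db, &rtxn)?;
        assert_eq!(stats.number_of_entries(), 2);
        assert_eq!(stats.total_key_size(), 3); // 1 + 2 bytes of keys
        assert_eq!(stats.total_value_size(), 12); // 5 + 7 bytes of values
        assert_eq!(stats.min_value_size(), 5);
        assert_eq!(stats.max_value_size(), 7);
        Ok(())
    }
}
```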

View File

@@ -11,6 +11,7 @@ use rstar::RTree;
 use serde::{Deserialize, Serialize};

 use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME};
+use crate::database_stats::DatabaseStats;
 use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
@@ -403,6 +404,11 @@ impl Index {
         Ok(count.unwrap_or_default())
     }

+    /// Returns the stats of the documents database.
+    pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result<DatabaseStats> {
+        Ok(DatabaseStats::new(self.documents.remap_types::<Bytes, Bytes>(), rtxn)?)
+    }
+
     /* primary key */

     /// Writes the documents primary key, this is the field name that is used to store the id.

View File

@@ -10,6 +10,7 @@ pub mod documents;
 mod asc_desc;
 mod criterion;
+pub mod database_stats;
 mod error;
 mod external_documents_ids;
 pub mod facet;

View File

@@ -1,7 +1,7 @@
 mod v1_12;

 use heed::RwTxn;
-use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3};
+use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3, V1_13_0_To_Current};

 use crate::progress::{Progress, VariableNameStep};
 use crate::{Index, InternalError, Result};
@@ -26,11 +26,13 @@ pub fn upgrade(
     progress: Progress,
 ) -> Result<bool> {
     let from = index.get_version(wtxn)?.unwrap_or(db_version);
-    let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()];
+    let upgrade_functions: &[&dyn UpgradeIndex] =
+        &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0(), &V1_13_0_To_Current()];

     let start = match from {
         (1, 12, 0..=2) => 0,
         (1, 12, 3..) => 1,
+        (1, 13, 0) => 2,
         // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
         (1, 13, _) => return Ok(false),
         (major, minor, patch) => {
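The match arms and the `upgrade_functions` table have to stay in lockstep: `from` selects the first step to run, and every later step in the chain still executes. A standalone sketch of that selection logic (step names shown as comments, simplified from the function above):

```rust
// Maps the persisted index version to an entry point in the ordered upgrade
// chain; `None` means the index is already current.
fn start_index(from: (u32, u32, u32)) -> Option<usize> {
    match from {
        (1, 12, 0..=2) => Some(0), // V1_12_To_V1_12_3
        (1, 12, 3..) => Some(1),   // V1_12_3_To_V1_13_0
        (1, 13, 0) => Some(2),     // V1_13_0_To_Current
        (1, 13, _) => None,        // already on the current version
        _ => panic!("cannot upgrade from this version"),
    }
}

fn main() {
    assert_eq!(start_index((1, 12, 1)), Some(0));
    assert_eq!(start_index((1, 12, 5)), Some(1));
    assert_eq!(start_index((1, 13, 0)), Some(2));
    assert_eq!(start_index((1, 13, 1)), None);
}
```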

View File

@@ -33,9 +33,33 @@ impl UpgradeIndex for V1_12_To_V1_12_3 {
 }

 #[allow(non_camel_case_types)]
-pub(super) struct V1_12_3_To_Current();
+pub(super) struct V1_12_3_To_V1_13_0();

-impl UpgradeIndex for V1_12_3_To_Current {
+impl UpgradeIndex for V1_12_3_To_V1_13_0 {
+    fn upgrade(
+        &self,
+        _wtxn: &mut RwTxn,
+        _index: &Index,
+        _original: (u32, u32, u32),
+        _progress: Progress,
+    ) -> Result<bool> {
+        // Recompute the index stats.
+        Ok(true)
+    }
+
+    fn target_version(&self) -> (u32, u32, u32) {
+        // Target exactly 1.13.0 so that a partially upgraded set of indexes
+        // resumes at `V1_13_0_To_Current` (the `(1, 13, 0) => 2` arm above)
+        // instead of being reported as already current.
+        (1, 13, 0)
+    }
+}
+
+#[allow(non_camel_case_types)]
+pub(super) struct V1_13_0_To_Current();
+
+impl UpgradeIndex for V1_13_0_To_Current {
     fn upgrade(
         &self,
         _wtxn: &mut RwTxn,