Add document database stats

ManyTheFish 2025-02-10 14:10:13 +01:00
parent 4abf0db0b4
commit b5dc971afe
11 changed files with 177 additions and 12 deletions

View File

@@ -6,6 +6,7 @@ use std::{fs, thread};
 use meilisearch_types::heed::types::{SerdeJson, Str};
 use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
 use meilisearch_types::milli;
+use meilisearch_types::milli::database_stats::DatabaseStats;
 use meilisearch_types::milli::update::IndexerConfig;
 use meilisearch_types::milli::{FieldDistribution, Index};
 use serde::{Deserialize, Serialize};
@@ -98,8 +99,9 @@ pub enum IndexStatus {
 /// The statistics that can be computed from an `Index` object.
 #[derive(Serialize, Deserialize, Debug)]
 pub struct IndexStats {
-    /// Number of documents in the index.
-    pub number_of_documents: u64,
+    /// Stats of the documents database.
+    #[serde(default)]
+    pub documents_database_stats: DatabaseStats,
     /// Size taken up by the index' DB, in bytes.
     ///
     /// This includes the size taken by both the used and free pages of the DB, and as the free pages
@@ -131,7 +133,7 @@ impl IndexStats {
     /// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
     pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
         Ok(IndexStats {
-            number_of_documents: index.number_of_documents(rtxn)?,
+            documents_database_stats: index.documents_database_stats(rtxn)?,
             database_size: index.on_disk_size()?,
             used_database_size: index.used_size()?,
             primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
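Note: the `#[serde(default)]` attribute is what keeps previously persisted stats readable. A payload written before this change has no `documents_database_stats` field, so deserialization falls back to `DatabaseStats::default()` instead of failing. A minimal standalone sketch of that behavior (types reduced to two fields, using `serde`/`serde_json`; these are not the real Meilisearch definitions):

```rust
use serde::{Deserialize, Serialize};

// Simplified stand-in for milli's `DatabaseStats`; only two fields kept.
#[derive(Serialize, Deserialize, Debug, Default, PartialEq)]
#[serde(rename_all = "camelCase")]
struct DatabaseStats {
    number_of_entries: u64,
    total_value_size: u64,
}

#[derive(Serialize, Deserialize, Debug)]
struct IndexStats {
    // Absent in payloads persisted by older versions, so fall back to default.
    #[serde(default)]
    documents_database_stats: DatabaseStats,
    database_size: u64,
}

fn main() {
    // A stats payload written before this commit: no `documents_database_stats`.
    let old_payload = r#"{ "database_size": 4096 }"#;
    let stats: IndexStats = serde_json::from_str(old_payload).unwrap();
    assert_eq!(stats.documents_database_stats, DatabaseStats::default());
}
```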

View File

@@ -365,7 +365,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String {
         let stats = mapper.stats_of(rtxn, &name).unwrap();
         s.push_str(&format!(
             "{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n",
-            stats.number_of_documents, stats.field_distribution
+            stats.documents_database_stats.number_of_entries(),
+            stats.field_distribution
         ));
     }

View File

@@ -903,14 +903,22 @@ fn create_and_list_index() {
     index_scheduler.index("kefir").unwrap();
     let list = index_scheduler.get_paginated_indexes_stats(&AuthFilter::default(), 0, 20).unwrap();
-    snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r#"
+    snapshot!(json_string!(list, { "[1][0][1].created_at" => "[date]", "[1][0][1].updated_at" => "[date]", "[1][0][1].used_database_size" => "[bytes]", "[1][0][1].database_size" => "[bytes]" }), @r###"
     [
       1,
       [
         [
           "kefir",
           {
-            "number_of_documents": 0,
+            "documents_database_stats": {
+              "numberOfEntries": 0,
+              "totalKeySize": 0,
+              "totalValueSize": 0,
+              "maxKeySize": 0,
+              "maxValueSize": 0,
+              "minKeySize": 0,
+              "minValueSize": 0
+            },
             "database_size": "[bytes]",
             "used_database_size": "[bytes]",
             "primary_key": null,
@@ -921,5 +929,5 @@ fn create_and_list_index() {
         ]
       ]
     ]
-    "#);
+    "###);
 }

View File

@@ -494,6 +494,12 @@ pub async fn delete_index(
 pub struct IndexStats {
     /// Number of documents in the index
     pub number_of_documents: u64,
+    /// Size of the documents database, in bytes.
+    pub raw_document_db_size: u64,
+    /// Maximum size of a document in the documents database.
+    pub max_document_size: u64,
+    /// Average size of a document in the documents database.
+    pub avg_document_size: u64,
     /// Whether or not the index is currently ingesting document
     pub is_indexing: bool,
     /// Association of every field name with the number of times it occurs in the documents.
@@ -504,7 +510,10 @@ pub struct IndexStats {
 impl From<index_scheduler::IndexStats> for IndexStats {
     fn from(stats: index_scheduler::IndexStats) -> Self {
         IndexStats {
-            number_of_documents: stats.inner_stats.number_of_documents,
+            number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(),
+            raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
+            max_document_size: stats.inner_stats.documents_database_stats.max_value_size(),
+            avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
             is_indexing: stats.is_indexing,
             field_distribution: stats.inner_stats.field_distribution,
         }
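One consequence of deriving `avgDocumentSize` this way is integer division: in the test snapshots further down, 4 documents totalling 42 bytes report an average of 10, not 10.5. A hypothetical helper mirroring that guarded arithmetic (the free-function form is an illustration, not the actual milli API):

```rust
// Mirrors `DatabaseStats::average_value_size`: truncating division, with a
// guard so an empty documents database reports 0 instead of panicking.
fn average_value_size(total_value_size: u64, number_of_entries: u64) -> u64 {
    total_value_size.checked_div(number_of_entries).unwrap_or(0)
}

fn main() {
    // Matches the `delete_document_by_filter` snapshot: 42 bytes over 4 documents.
    assert_eq!(average_value_size(42, 4), 10); // truncates, does not round
    assert_eq!(average_value_size(0, 0), 0);   // empty-database guard
}
```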

View File

@@ -160,6 +160,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 4,
+      "rawDocumentDbSize": 42,
+      "maxDocumentSize": 13,
+      "avgDocumentSize": 10,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 3,
@@ -207,6 +210,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 2,
+      "rawDocumentDbSize": 16,
+      "maxDocumentSize": 12,
+      "avgDocumentSize": 8,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 1,
@@ -273,6 +279,9 @@ async fn delete_document_by_filter() {
     snapshot!(json_string!(stats), @r###"
     {
       "numberOfDocuments": 1,
+      "rawDocumentDbSize": 12,
+      "maxDocumentSize": 12,
+      "avgDocumentSize": 12,
       "isIndexing": false,
       "fieldDistribution": {
         "color": 1,

View File

@@ -172,6 +172,9 @@ async fn import_dump_v1_movie_with_settings() {
     @r###"
     {
       "numberOfDocuments": 53,
+      "rawDocumentDbSize": 21965,
+      "maxDocumentSize": 743,
+      "avgDocumentSize": 414,
       "isIndexing": false,
       "fieldDistribution": {
         "genres": 53,

View File

@@ -0,0 +1,100 @@
+use heed::types::Bytes;
+use heed::Database;
+use heed::RoTxn;
+use serde::{Deserialize, Serialize};
+
+use crate::Result;
+
+#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
+#[serde(rename_all = "camelCase")]
+/// The stats of a database.
+pub struct DatabaseStats {
+    /// The number of entries in the database.
+    number_of_entries: u64,
+    /// The total size of the keys in the database.
+    total_key_size: u64,
+    /// The total size of the values in the database.
+    total_value_size: u64,
+    /// The maximum size of a key in the database.
+    max_key_size: u64,
+    /// The maximum size of a value in the database.
+    max_value_size: u64,
+    /// The minimum size of a key in the database.
+    min_key_size: u64,
+    /// The minimum size of a value in the database.
+    min_value_size: u64,
+}
+
+impl DatabaseStats {
+    /// Returns the stats of the database.
+    ///
+    /// This function iterates over the whole database and computes the stats.
+    /// It is not efficient and should be cached somewhere.
+    pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> Result<Self> {
+        let mut database_stats = Self {
+            number_of_entries: 0,
+            total_key_size: 0,
+            total_value_size: 0,
+            max_key_size: 0,
+            max_value_size: 0,
+            min_key_size: u64::MAX,
+            min_value_size: u64::MAX,
+        };
+
+        let mut iter = database.iter(rtxn)?;
+        while let Some((key, value)) = iter.next().transpose()? {
+            let key_size = key.len() as u64;
+            let value_size = value.len() as u64;
+            database_stats.number_of_entries += 1;
+            database_stats.total_key_size += key_size;
+            database_stats.total_value_size += value_size;
+            database_stats.max_key_size = database_stats.max_key_size.max(key_size);
+            database_stats.max_value_size = database_stats.max_value_size.max(value_size);
+            database_stats.min_key_size = database_stats.min_key_size.min(key_size);
+            database_stats.min_value_size = database_stats.min_value_size.min(value_size);
+        }
+
+        // An empty database has no entries to take a minimum over.
+        if database_stats.number_of_entries == 0 {
+            database_stats.min_key_size = 0;
+            database_stats.min_value_size = 0;
+        }
+
+        Ok(database_stats)
+    }
+
+    /// Returns the average key size, or 0 when the database is empty.
+    pub fn average_key_size(&self) -> u64 {
+        self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    /// Returns the average value size, or 0 when the database is empty.
+    pub fn average_value_size(&self) -> u64 {
+        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
+    }
+
+    pub fn number_of_entries(&self) -> u64 {
+        self.number_of_entries
+    }
+
+    pub fn total_key_size(&self) -> u64 {
+        self.total_key_size
+    }
+
+    pub fn total_value_size(&self) -> u64 {
+        self.total_value_size
+    }
+
+    pub fn max_key_size(&self) -> u64 {
+        self.max_key_size
+    }
+
+    pub fn max_value_size(&self) -> u64 {
+        self.max_value_size
+    }
+
+    pub fn min_key_size(&self) -> u64 {
+        self.min_key_size
+    }
+
+    pub fn min_value_size(&self) -> u64 {
+        self.min_value_size
+    }
+}
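A minimal sketch of how `DatabaseStats::new` could be exercised from inside milli (the constructor is `pub(crate)`, so this has to live in the crate's own tests). The `tempfile` dev-dependency and the map size are assumptions:

```rust
#[cfg(test)]
mod tests {
    use heed::types::Bytes;
    use heed::EnvOpenOptions;

    use super::DatabaseStats;

    #[test]
    fn stats_of_a_small_database() -> Result<(), Box<dyn std::error::Error>> {
        let dir = tempfile::tempdir()?;
        // `open` is unsafe in recent heed versions: the caller must guarantee
        // the same environment is not opened twice by the process.
        let env = unsafe { EnvOpenOptions::new().map_size(10 * 1024 * 1024).open(dir.path())? };

        let mut wtxn = env.write_txn()?;
        let db = env.create_database::<Bytes, Bytes>(&mut wtxn, None)?;
        db.put(&mut wtxn, b"a", b"hello")?;
        db.put(&mut wtxn, b"bb", b"world!!")?;
        wtxn.commit()?;

        let rtxn = env.read_txn()?;
        let stats = DatabaseStats::new(db, &rtxn)?;
        assert_eq!(stats.number_of_entries(), 2);
        assert_eq!(stats.total_key_size(), 3); // 1 + 2 bytes of keys
        assert_eq!(stats.total_value_size(), 12); // 5 + 7 bytes of values
        assert_eq!(stats.min_value_size(), 5);
        assert_eq!(stats.max_value_size(), 7);
        Ok(())
    }
}
```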

View File

@@ -11,6 +11,7 @@ use rstar::RTree;
 use serde::{Deserialize, Serialize};

 use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME};
+use crate::database_stats::DatabaseStats;
 use crate::documents::PrimaryKey;
 use crate::error::{InternalError, UserError};
 use crate::fields_ids_map::FieldsIdsMap;
@@ -403,6 +404,11 @@ impl Index {
         Ok(count.unwrap_or_default())
     }

+    /// Returns the stats of the documents database.
+    pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result<DatabaseStats> {
+        Ok(DatabaseStats::new(self.documents.remap_types::<Bytes, Bytes>(), rtxn)?)
+    }
+
     /* primary key */

     /// Writes the documents primary key, this is the field name that is used to store the id.

View File

@@ -10,6 +10,7 @@ pub mod documents;
 mod asc_desc;
 mod criterion;
+pub mod database_stats;
 mod error;
 mod external_documents_ids;
 pub mod facet;

View File

@@ -1,7 +1,7 @@
 mod v1_12;

 use heed::RwTxn;
-use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3};
+use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3, V1_13_0_To_Current};

 use crate::progress::{Progress, VariableNameStep};
 use crate::{Index, InternalError, Result};
@@ -26,11 +26,13 @@ pub fn upgrade(
     progress: Progress,
 ) -> Result<bool> {
     let from = index.get_version(wtxn)?.unwrap_or(db_version);
-    let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()];
+    let upgrade_functions: &[&dyn UpgradeIndex] =
+        &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0(), &V1_13_0_To_Current()];

     let start = match from {
         (1, 12, 0..=2) => 0,
         (1, 12, 3..) => 1,
+        (1, 13, 0) => 2,
         // We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
         (1, 13, _) => return Ok(false),
         (major, minor, patch) => {
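The match arms and the `upgrade_functions` table have to stay in lockstep: `from` selects the first step to run, and every later step in the chain still executes. A standalone sketch of that selection logic (step names shown as comments, simplified from the function above):

```rust
// Maps the persisted index version to an entry point in the ordered upgrade
// chain; `None` means the index is already current.
fn start_index(from: (u32, u32, u32)) -> Option<usize> {
    match from {
        (1, 12, 0..=2) => Some(0), // V1_12_To_V1_12_3
        (1, 12, 3..) => Some(1),   // V1_12_3_To_V1_13_0
        (1, 13, 0) => Some(2),     // V1_13_0_To_Current
        (1, 13, _) => None,        // already on the current version
        _ => panic!("cannot upgrade from this version"),
    }
}

fn main() {
    assert_eq!(start_index((1, 12, 1)), Some(0));
    assert_eq!(start_index((1, 12, 5)), Some(1));
    assert_eq!(start_index((1, 13, 0)), Some(2));
    assert_eq!(start_index((1, 13, 1)), None);
}
```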

View File

@@ -33,9 +33,33 @@ impl UpgradeIndex for V1_12_To_V1_12_3 {
 }

 #[allow(non_camel_case_types)]
-pub(super) struct V1_12_3_To_Current();
+pub(super) struct V1_12_3_To_V1_13_0();

-impl UpgradeIndex for V1_12_3_To_Current {
+impl UpgradeIndex for V1_12_3_To_V1_13_0 {
+    fn upgrade(
+        &self,
+        _wtxn: &mut RwTxn,
+        _index: &Index,
+        _original: (u32, u32, u32),
+        _progress: Progress,
+    ) -> Result<bool> {
+        // Recompute the index stats.
+        Ok(true)
+    }
+
+    fn target_version(&self) -> (u32, u32, u32) {
+        // Target exactly 1.13.0 so that a partially upgraded set of indexes
+        // resumes at `V1_13_0_To_Current` (the `(1, 13, 0) => 2` arm above)
+        // instead of being reported as already current.
+        (1, 13, 0)
+    }
+}
+
+#[allow(non_camel_case_types)]
+pub(super) struct V1_13_0_To_Current();
+
+impl UpgradeIndex for V1_13_0_To_Current {
     fn upgrade(
         &self,
         _wtxn: &mut RwTxn,