From db7ce037634989ebce6040c1d291d7022924b395 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 26 Mar 2025 17:13:09 +0100 Subject: [PATCH] Improve the performances of computing the size of the documents database --- crates/meilisearch/src/routes/indexes/mod.rs | 2 +- crates/milli/src/database_stats.rs | 77 +++++++------------ crates/milli/src/index.rs | 32 -------- .../milli/src/update/index_documents/mod.rs | 4 +- crates/milli/src/update/new/indexer/mod.rs | 1 - crates/milli/src/update/new/indexer/write.rs | 5 +- 6 files changed, 33 insertions(+), 88 deletions(-) diff --git a/crates/meilisearch/src/routes/indexes/mod.rs b/crates/meilisearch/src/routes/indexes/mod.rs index 5aebf5cac..48ed1cfb1 100644 --- a/crates/meilisearch/src/routes/indexes/mod.rs +++ b/crates/meilisearch/src/routes/indexes/mod.rs @@ -518,7 +518,7 @@ impl From for IndexStats { .inner_stats .number_of_documents .unwrap_or(stats.inner_stats.documents_database_stats.number_of_entries()), - raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(), + raw_document_db_size: stats.inner_stats.documents_database_stats.total_size(), avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(), is_indexing: stats.is_indexing, number_of_embeddings: stats.inner_stats.number_of_embeddings, diff --git a/crates/milli/src/database_stats.rs b/crates/milli/src/database_stats.rs index d97dc13ba..7da1fbd2b 100644 --- a/crates/milli/src/database_stats.rs +++ b/crates/milli/src/database_stats.rs @@ -1,8 +1,13 @@ -use heed::types::Bytes; +use std::mem; + use heed::Database; +use heed::DatabaseStat; use heed::RoTxn; +use heed::Unspecified; use serde::{Deserialize, Serialize}; +use crate::BEU32; + #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)] #[serde(rename_all = "camelCase")] /// The stats of a database. @@ -20,58 +25,24 @@ impl DatabaseStats { /// /// This function iterates over the whole database and computes the stats. 
/// It is not efficient and should be cached somewhere. - pub(crate) fn new(database: Database, rtxn: &RoTxn<'_>) -> heed::Result { - let mut database_stats = - Self { number_of_entries: 0, total_key_size: 0, total_value_size: 0 }; + pub(crate) fn new( + database: Database, + rtxn: &RoTxn<'_>, + ) -> heed::Result { + let DatabaseStat { page_size, depth: _, branch_pages, leaf_pages, overflow_pages, entries } = + database.stat(rtxn)?; - let mut iter = database.iter(rtxn)?; - while let Some((key, value)) = iter.next().transpose()? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - database_stats.total_key_size += key_size; - database_stats.total_value_size += value_size; - } + // We take the total size of all the pages: branch, leaf, and overflow pages (the overflow pages contain the values and only that). + let total_size = (branch_pages + leaf_pages + overflow_pages) * page_size as usize; + // We compute an estimated size for the keys. + let total_key_size = entries * (mem::size_of::() + 4); + let total_value_size = total_size - total_key_size; - database_stats.number_of_entries = database.len(rtxn)?; - - Ok(database_stats) - } - - /// Recomputes the stats of the database and returns the new stats. - /// - /// This function is used to update the stats of the database when some keys are modified. - /// It is more efficient than the `new` function because it does not iterate over the whole database but only the modified keys comparing the before and after states. - pub(crate) fn recompute( - mut stats: Self, - database: Database, - before_rtxn: &RoTxn<'_>, - after_rtxn: &RoTxn<'_>, - modified_keys: I, - ) -> heed::Result - where - I: IntoIterator, - K: AsRef<[u8]>, - { - for key in modified_keys { - let key = key.as_ref(); - if let Some(value) = database.get(after_rtxn, key)?
{ - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_add(key_size); - stats.total_value_size = stats.total_value_size.saturating_add(value_size); - } - - if let Some(value) = database.get(before_rtxn, key)? { - let key_size = key.len() as u64; - let value_size = value.len() as u64; - stats.total_key_size = stats.total_key_size.saturating_sub(key_size); - stats.total_value_size = stats.total_value_size.saturating_sub(value_size); - } - } - - stats.number_of_entries = database.len(after_rtxn)?; - - Ok(stats) + Ok(Self { + number_of_entries: entries as u64, + total_key_size: total_key_size as u64, + total_value_size: total_value_size as u64, + }) } pub fn average_key_size(&self) -> u64 { @@ -86,6 +57,10 @@ impl DatabaseStats { self.number_of_entries } + pub fn total_size(&self) -> u64 { + self.total_key_size + self.total_value_size + } + pub fn total_key_size(&self) -> u64 { self.total_key_size } diff --git a/crates/milli/src/index.rs b/crates/milli/src/index.rs index a2d839d03..5f74863e8 100644 --- a/crates/milli/src/index.rs +++ b/crates/milli/src/index.rs @@ -411,38 +411,6 @@ impl Index { Ok(count.unwrap_or_default()) } - /// Updates the stats of the documents database based on the previous stats and the modified docids. - pub fn update_documents_stats( - &self, - wtxn: &mut RwTxn<'_>, - modified_docids: roaring::RoaringBitmap, - ) -> Result<()> { - let before_rtxn = self.read_txn()?; - let document_stats = match self.documents_stats(&before_rtxn)? { - Some(before_stats) => DatabaseStats::recompute( - before_stats, - self.documents.remap_types(), - &before_rtxn, - wtxn, - modified_docids.iter().map(|docid| docid.to_be_bytes()), - )?, - None => { - // This should never happen when there are already documents in the index, the documents stats should be present. - // If it happens, it means that the index was not properly initialized/upgraded. 
- debug_assert_eq!( - self.documents.len(&before_rtxn)?, - 0, - "The documents stats should be present when there are documents in the index" - ); - tracing::warn!("No documents stats found, creating new ones"); - DatabaseStats::new(self.documents.remap_types(), &*wtxn)? - } - }; - - self.put_documents_stats(wtxn, document_stats)?; - Ok(()) - } - /// Writes the stats of the documents database. pub fn put_documents_stats( &self, diff --git a/crates/milli/src/update/index_documents/mod.rs b/crates/milli/src/update/index_documents/mod.rs index 95342054d..5d445d283 100644 --- a/crates/milli/src/update/index_documents/mod.rs +++ b/crates/milli/src/update/index_documents/mod.rs @@ -28,6 +28,7 @@ pub use self::helpers::*; pub use self::transform::{Transform, TransformOutput}; use super::facet::clear_facet_levels_based_on_settings_diff; use super::new::StdResult; +use crate::database_stats::DatabaseStats; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::{Error, InternalError}; use crate::index::{PrefixSearch, PrefixSettings}; @@ -476,7 +477,8 @@ where if !settings_diff.settings_update_only { // Update the stats of the documents database when there is a document update. 
- self.index.update_documents_stats(self.wtxn, modified_docids)?; + let stats = DatabaseStats::new(self.index.documents.remap_data_type(), self.wtxn)?; + self.index.put_documents_stats(self.wtxn, stats)?; } // We write the field distribution into the main database self.index.put_field_distribution(self.wtxn, &field_distribution)?; diff --git a/crates/milli/src/update/new/indexer/mod.rs b/crates/milli/src/update/new/indexer/mod.rs index 4f2dd19c9..d2a88f4ff 100644 --- a/crates/milli/src/update/new/indexer/mod.rs +++ b/crates/milli/src/update/new/indexer/mod.rs @@ -234,7 +234,6 @@ where embedders, field_distribution, document_ids, - modified_docids, )?; Ok(congestion) diff --git a/crates/milli/src/update/new/indexer/write.rs b/crates/milli/src/update/new/indexer/write.rs index 8618b4b21..7ab7991b2 100644 --- a/crates/milli/src/update/new/indexer/write.rs +++ b/crates/milli/src/update/new/indexer/write.rs @@ -7,6 +7,7 @@ use rand::SeedableRng as _; use time::OffsetDateTime; use super::super::channel::*; +use crate::database_stats::DatabaseStats; use crate::documents::PrimaryKey; use crate::fields_ids_map::metadata::FieldIdMapWithMetadata; use crate::index::IndexEmbeddingConfig; @@ -142,7 +143,6 @@ pub(super) fn update_index( embedders: EmbeddingConfigs, field_distribution: std::collections::BTreeMap, document_ids: roaring::RoaringBitmap, - modified_docids: roaring::RoaringBitmap, ) -> Result<()> { index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?; if let Some(new_primary_key) = new_primary_key { @@ -153,7 +153,8 @@ pub(super) fn update_index( index.put_field_distribution(wtxn, &field_distribution)?; index.put_documents_ids(wtxn, &document_ids)?; index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; - index.update_documents_stats(wtxn, modified_docids)?; + let stats = DatabaseStats::new(index.documents.remap_data_type(), wtxn)?; + index.put_documents_stats(wtxn, stats)?; Ok(()) }