Add document database stats

This commit is contained in:
ManyTheFish 2025-02-10 14:10:13 +01:00 committed by Kerollmops
parent 91a8a97045
commit 9a6c1730aa
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
9 changed files with 145 additions and 6 deletions

View File

@ -6,6 +6,7 @@ use std::{fs, thread};
use meilisearch_types::heed::types::{SerdeJson, Str};
use meilisearch_types::heed::{Database, Env, RoTxn, RwTxn};
use meilisearch_types::milli;
use meilisearch_types::milli::database_stats::DatabaseStats;
use meilisearch_types::milli::update::IndexerConfig;
use meilisearch_types::milli::{FieldDistribution, Index};
use serde::{Deserialize, Serialize};
@ -98,8 +99,9 @@ pub enum IndexStatus {
/// The statistics that can be computed from an `Index` object.
#[derive(Serialize, Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Stats of the documents database.
#[serde(default)]
pub documents_database_stats: DatabaseStats,
/// Size taken up by the index' DB, in bytes.
///
/// This includes the size taken by both the used and free pages of the DB, and as the free pages
@ -138,9 +140,9 @@ impl IndexStats {
pub fn new(index: &Index, rtxn: &RoTxn) -> milli::Result<Self> {
let arroy_stats = index.arroy_stats(rtxn)?;
Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?,
number_of_embeddings: Some(arroy_stats.number_of_embeddings),
number_of_embedded_documents: Some(arroy_stats.documents.len()),
documents_database_stats: index.documents_database_stats(rtxn)?,
database_size: index.on_disk_size()?,
used_database_size: index.used_size()?,
primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),

View File

@ -370,7 +370,8 @@ pub fn snapshot_index_mapper(rtxn: &RoTxn, mapper: &IndexMapper) -> String {
let stats = mapper.stats_of(rtxn, &name).unwrap();
s.push_str(&format!(
"{name}: {{ number_of_documents: {}, field_distribution: {:?} }}\n",
stats.number_of_documents, stats.field_distribution
stats.documents_database_stats.number_of_entries(),
stats.field_distribution
));
}

View File

@ -910,7 +910,15 @@ fn create_and_list_index() {
[
"kefir",
{
"number_of_documents": 0,
"documents_database_stats": {
"numberOfEntries": 0,
"totalKeySize": 0,
"totalValueSize": 0,
"maxKeySize": 0,
"maxValueSize": 0,
"minKeySize": 0,
"minValueSize": 0
},
"database_size": "[bytes]",
"number_of_embeddings": 0,
"number_of_embedded_documents": 0,

View File

@ -494,6 +494,12 @@ pub async fn delete_index(
pub struct IndexStats {
/// Number of documents in the index
pub number_of_documents: u64,
/// Size of the documents database, in bytes.
pub raw_document_db_size: u64,
/// Maximum size of a document in the documents database, in bytes.
pub max_document_size: u64,
/// Average size of a document in the documents database, in bytes (rounded down).
pub avg_document_size: u64,
/// Whether or not the index is currently ingesting documents
pub is_indexing: bool,
/// Number of embeddings in the index
@ -510,7 +516,10 @@ pub struct IndexStats {
impl From<index_scheduler::IndexStats> for IndexStats {
fn from(stats: index_scheduler::IndexStats) -> Self {
IndexStats {
number_of_documents: stats.inner_stats.number_of_documents,
number_of_documents: stats.inner_stats.documents_database_stats.number_of_entries(),
raw_document_db_size: stats.inner_stats.documents_database_stats.total_value_size(),
max_document_size: stats.inner_stats.documents_database_stats.max_value_size(),
avg_document_size: stats.inner_stats.documents_database_stats.average_value_size(),
is_indexing: stats.is_indexing,
number_of_embeddings: stats.inner_stats.number_of_embeddings,
number_of_embedded_documents: stats.inner_stats.number_of_embedded_documents,

View File

@ -160,6 +160,9 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 4,
"rawDocumentDbSize": 42,
"maxDocumentSize": 13,
"avgDocumentSize": 10,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@ -209,6 +212,9 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 2,
"rawDocumentDbSize": 16,
"maxDocumentSize": 12,
"avgDocumentSize": 8,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,
@ -277,6 +283,9 @@ async fn delete_document_by_filter() {
snapshot!(json_string!(stats), @r###"
{
"numberOfDocuments": 1,
"rawDocumentDbSize": 12,
"maxDocumentSize": 12,
"avgDocumentSize": 12,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@ -187,6 +187,9 @@ async fn import_dump_v1_movie_with_settings() {
@r###"
{
"numberOfDocuments": 53,
"rawDocumentDbSize": 21965,
"maxDocumentSize": 743,
"avgDocumentSize": 414,
"isIndexing": false,
"numberOfEmbeddings": 0,
"numberOfEmbeddedDocuments": 0,

View File

@ -0,0 +1,100 @@
use heed::types::Bytes;
use heed::Database;
use heed::RoTxn;
use serde::{Deserialize, Serialize};
use crate::Result;
#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Default)]
#[serde(rename_all = "camelCase")]
/// Aggregated size statistics of a key-value database.
///
/// Every size is the byte length of a raw key or value as stored in the database.
/// The derived `Default` (all zeros) matches what is computed for an empty database.
/// Fields are serialized with camelCase names (e.g. `numberOfEntries`).
pub struct DatabaseStats {
/// The number of entries in the database.
number_of_entries: u64,
/// The total size of the keys in the database, in bytes.
total_key_size: u64,
/// The total size of the values in the database, in bytes.
total_value_size: u64,
/// The maximum size of a key in the database, in bytes (0 when the database is empty).
max_key_size: u64,
/// The maximum size of a value in the database, in bytes (0 when the database is empty).
max_value_size: u64,
/// The minimum size of a key in the database, in bytes (0 when the database is empty).
min_key_size: u64,
/// The minimum size of a value in the database, in bytes (0 when the database is empty).
min_value_size: u64,
}
impl DatabaseStats {
    /// Computes the stats of `database` by reading every entry through `rtxn`.
    ///
    /// This function iterates over the whole database, so its cost grows with the
    /// number of entries. It is not efficient and the result should be cached somewhere.
    ///
    /// # Errors
    ///
    /// Returns an error if the iterator cannot be created or if reading an entry fails.
    pub(crate) fn new(database: Database<Bytes, Bytes>, rtxn: &RoTxn<'_>) -> Result<Self> {
        // The minimums start at `u64::MAX` so that the first entry always lowers them;
        // every other counter starts at zero.
        let mut database_stats =
            Self { min_key_size: u64::MAX, min_value_size: u64::MAX, ..Self::default() };

        for entry in database.iter(rtxn)? {
            let (key, value) = entry?;
            let key_size = key.len() as u64;
            let value_size = value.len() as u64;

            database_stats.number_of_entries += 1;
            database_stats.total_key_size += key_size;
            database_stats.total_value_size += value_size;
            database_stats.max_key_size = database_stats.max_key_size.max(key_size);
            database_stats.max_value_size = database_stats.max_value_size.max(value_size);
            database_stats.min_key_size = database_stats.min_key_size.min(key_size);
            database_stats.min_value_size = database_stats.min_value_size.min(value_size);
        }

        // Without any entry the minimums would still hold the `u64::MAX` placeholder;
        // report zero instead.
        if database_stats.number_of_entries == 0 {
            database_stats.min_key_size = 0;
            database_stats.min_value_size = 0;
        }

        Ok(database_stats)
    }

    /// Returns the average key size in bytes, rounded down.
    ///
    /// Returns 0 when the database has no entries instead of dividing by zero.
    pub fn average_key_size(&self) -> u64 {
        self.total_key_size.checked_div(self.number_of_entries).unwrap_or(0)
    }

    /// Returns the average value size in bytes, rounded down.
    ///
    /// Returns 0 when the database has no entries instead of dividing by zero.
    pub fn average_value_size(&self) -> u64 {
        self.total_value_size.checked_div(self.number_of_entries).unwrap_or(0)
    }

    /// Returns the number of entries in the database.
    pub fn number_of_entries(&self) -> u64 {
        self.number_of_entries
    }

    /// Returns the total size of the keys, in bytes.
    pub fn total_key_size(&self) -> u64 {
        self.total_key_size
    }

    /// Returns the total size of the values, in bytes.
    pub fn total_value_size(&self) -> u64 {
        self.total_value_size
    }

    /// Returns the size of the largest key, in bytes.
    pub fn max_key_size(&self) -> u64 {
        self.max_key_size
    }

    /// Returns the size of the largest value, in bytes.
    pub fn max_value_size(&self) -> u64 {
        self.max_value_size
    }

    /// Returns the size of the smallest key, in bytes.
    pub fn min_key_size(&self) -> u64 {
        self.min_key_size
    }

    /// Returns the size of the smallest value, in bytes.
    pub fn min_value_size(&self) -> u64 {
        self.min_value_size
    }
}

View File

@ -11,6 +11,7 @@ use rstar::RTree;
use serde::{Deserialize, Serialize};
use crate::constants::{self, RESERVED_VECTORS_FIELD_NAME};
use crate::database_stats::DatabaseStats;
use crate::documents::PrimaryKey;
use crate::error::{InternalError, UserError};
use crate::fields_ids_map::FieldsIdsMap;
@ -403,6 +404,11 @@ impl Index {
Ok(count.unwrap_or_default())
}
/// Computes the stats of the documents database.
///
/// The documents database is viewed as raw bytes and handed to [`DatabaseStats::new`],
/// so any error it reports is propagated to the caller.
pub fn documents_database_stats(&self, rtxn: &RoTxn<'_>) -> Result<DatabaseStats> {
    let raw_documents = self.documents.remap_types::<Bytes, Bytes>();
    let stats = DatabaseStats::new(raw_documents, rtxn)?;
    Ok(stats)
}
/* primary key */
/// Writes the documents primary key, this is the field name that is used to store the id.

View File

@ -10,6 +10,7 @@ pub mod documents;
mod asc_desc;
mod criterion;
pub mod database_stats;
mod error;
mod external_documents_ids;
pub mod facet;