Merge pull request #189 from meilisearch/documents-fields-repartition

Add the documents fields repartition into stats
2025-07-04 04:17:10 +02:00 · 2019-09-17 16:23:49 +02:00 · 2019-09-17 16:23:49 +02:00 · c10b701b9a
commit c10b701b9a
parent 97cf5cca2a 80caa8b60d
4 changed files with 105 additions and 3 deletions
--- a/meilidb-data/Cargo.toml
+++ b/meilidb-data/Cargo.toml
@ -37,3 +37,4 @@ branch = "arc-byte-slice"
 [dev-dependencies]
 tempfile = "3.1.0"
 maplit = "1.0.2"
--- a/meilidb-data/src/database/index/documents_index.rs
+++ b/meilidb-data/src/database/index/documents_index.rs
@ -1,7 +1,8 @@
 use std::convert::TryInto;
 use std::collections::HashMap;
 use meilidb_core::DocumentId;
-use meilidb_schema::SchemaAttr;
+use meilidb_schema::{Schema, SchemaAttr};
 use rocksdb::DBVector;
 use crate::document_attr_key::DocumentAttrKey;
@ -54,6 +55,20 @@ impl DocumentsIndex {
        Ok(DocumentFieldsIter(iter))
    }
    /// Computes how many stored document-field entries exist for each attribute.
    ///
    /// Every key of the underlying store is decoded and its `attribute`
    /// component is tallied; attribute ids are then translated into their
    /// names through `schema.attribute_name`.
    ///
    /// # Errors
    ///
    /// Returns the error produced while creating the iterator over the store.
    ///
    /// # Panics
    ///
    /// Key decoding is delegated to `DocumentsKeysIter`, which unwraps the
    /// conversion of the raw key bytes.
    pub fn documents_fields_repartition(&self, schema: Schema) -> RocksDbResult<HashMap<String, u64>> {
        let keys = DocumentsKeysIter(self.0.iter()?);

        // First pass: one counter per attribute id.
        let mut occurrences = HashMap::new();
        for attribute in keys.map(|key| key.attribute) {
            *occurrences.entry(attribute).or_insert(0u64) += 1;
        }

        // Second pass: swap each attribute id for its name in the schema.
        let named: HashMap<String, u64> = occurrences
            .into_iter()
            .map(|(attribute, count)| (schema.attribute_name(attribute).to_owned(), count))
            .collect();

        Ok(named)
    }
    pub fn len(&self) -> RocksDbResult<u64> {
        let mut last_document_id = None;
        let mut count = 0;
@ -88,3 +103,20 @@ impl Iterator for DocumentFieldsIter<'_> {
        }
    }
 }
 /// Iterator adapter over a `crate::CfIter` that yields only the decoded keys,
 /// as [`DocumentAttrKey`] values; the associated values are discarded.
 pub struct DocumentsKeysIter<'a>(crate::CfIter<'a>);
 impl Iterator for DocumentsKeysIter<'_> {
    type Item = DocumentAttrKey;

    /// Advances the wrapped iterator and decodes the key of the next entry.
    ///
    /// # Panics
    ///
    /// Panics if the raw key bytes cannot be converted into the array type
    /// accepted by `DocumentAttrKey::from_be_bytes`.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next().map(|(raw_key, _)| {
            let bytes = raw_key.as_ref().try_into().unwrap();
            DocumentAttrKey::from_be_bytes(bytes)
        })
    }
 }
--- a/meilidb-data/src/database/index/mod.rs
+++ b/meilidb-data/src/database/index/mod.rs
@ -1,4 +1,4 @@
-use std::collections::{HashSet, BTreeMap};
+use std::collections::{HashMap, HashSet, BTreeMap};
 use std::convert::TryInto;
 use std::sync::Arc;
 use std::sync::atomic::{AtomicU64, Ordering};
@ -155,11 +155,12 @@ fn last_update_id(
    Ok(uikey.max(urikey).unwrap_or(0))
 }
-#[derive(Copy, Clone)]
+#[derive(Clone)]
 /// Snapshot of index statistics, as assembled by `Index::stats`.
 pub struct IndexStats {
    /// Length of the cached `words` collection.
    pub number_of_words: usize,
    /// Number of documents recorded in the index cache.
    pub number_of_documents: u64,
    /// Length of the cached ranked map.
    pub number_attrs_in_ranked_map: usize,
    /// Per-attribute-name count of stored document fields, as returned by
    /// `DocumentsIndex::documents_fields_repartition`.
    pub documents_fields_repartition: HashMap<String, u64>,
 }
 #[derive(Clone)]
@ -275,10 +276,12 @@ impl Index {
    /// Builds an [`IndexStats`] snapshot for this index.
    ///
    /// The word count, document count and ranked-map size are read from the
    /// currently loaded cache; the fields repartition is computed from the
    /// documents index using a clone of the cached schema.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `documents_fields_repartition`.
    pub fn stats(&self) -> RocksDbResult<IndexStats> {
        let cache = self.cache.load();

        // `documents_fields_repartition` takes the schema by value, hence the clone.
        let schema = cache.schema.clone();
        let fields_repartition = self.documents_index.documents_fields_repartition(schema)?;

        let stats = IndexStats {
            number_of_words: cache.words.len(),
            number_of_documents: cache.number_of_documents,
            number_attrs_in_ranked_map: cache.ranked_map.len(),
            documents_fields_repartition: fields_repartition,
        };
        Ok(stats)
    }
--- a/meilidb-data/tests/updates.rs
+++ b/meilidb-data/tests/updates.rs
@ -1,3 +1,5 @@
 #[macro_use] extern crate maplit;
 use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
 use std::sync::Arc;
@ -94,3 +96,67 @@ fn replace_document() {
    assert_eq!(docs.len(), 1);
    assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
 }
 /// Adds three documents one at a time and checks, after each update, that
 /// the index stats report the expected document count and the expected
 /// per-field repartition.
 #[test]
 fn database_stats() {
    let tmp_dir = tempfile::tempdir().unwrap();
    let database = Database::open(&tmp_dir).unwrap();
    let index = database.create_index("hello", simple_schema()).unwrap();

    // Flag flipped by the update callback so each step can check it fired.
    let has_been_updated = Arc::new(AtomicBool::new(false));
    let callback_flag = has_been_updated.clone();
    index.set_update_callback(move |_| callback_flag.store(true, Relaxed));

    // (document to add, expected number of documents, expected repartition)
    let steps = vec![
        (
            json!({ "objectId": 123, "title": "hello" }),
            1u64,
            hashmap!{
                "objectId".to_string() => 1u64,
                "title".to_string() => 1u64,
            },
        ),
        (
            json!({ "objectId": 456, "title": "world" }),
            2u64,
            hashmap!{
                "objectId".to_string() => 2u64,
                "title".to_string() => 2u64,
            },
        ),
        (
            // No "title" here: only the "objectId" counter must grow.
            json!({ "objectId": 789 }),
            3u64,
            hashmap!{
                "objectId".to_string() => 3u64,
                "title".to_string() => 2u64,
            },
        ),
    ];

    for (document, expected_documents, expected_repartition) in steps {
        let mut addition = index.documents_addition();
        addition.update_document(&document);
        let update_id = addition.finalize().unwrap();
        let status = index.update_status_blocking(update_id).unwrap();
        assert!(has_been_updated.swap(false, Relaxed));
        assert!(status.result.is_ok());

        let stats = index.stats().unwrap();
        assert_eq!(stats.number_of_documents, expected_documents);
        assert_eq!(stats.documents_fields_repartition, expected_repartition);
    }
 }