Merge pull request #189 from meilisearch/documents-fields-repartition

Add the documents fields repartition into stats
This commit is contained in:
Clément Renault 2019-09-17 16:23:49 +02:00 committed by GitHub
commit c10b701b9a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 105 additions and 3 deletions

View File

@ -37,3 +37,4 @@ branch = "arc-byte-slice"
[dev-dependencies] [dev-dependencies]
tempfile = "3.1.0" tempfile = "3.1.0"
maplit = "1.0.2"

View File

@ -1,7 +1,8 @@
use std::convert::TryInto; use std::convert::TryInto;
use std::collections::HashMap;
use meilidb_core::DocumentId; use meilidb_core::DocumentId;
use meilidb_schema::SchemaAttr; use meilidb_schema::{Schema, SchemaAttr};
use rocksdb::DBVector; use rocksdb::DBVector;
use crate::document_attr_key::DocumentAttrKey; use crate::document_attr_key::DocumentAttrKey;
@ -54,6 +55,20 @@ impl DocumentsIndex {
Ok(DocumentFieldsIter(iter)) Ok(DocumentFieldsIter(iter))
} }
/// Counts how many keys of the underlying store carry each attribute and
/// reports the totals under the attribute names.
///
/// The store is scanned once to tally entries per `key.attribute`; each
/// attribute is then resolved to its name with `schema.attribute_name`.
/// Presumably every stored key corresponds to one field of one document,
/// so a total reads as "number of documents having that field" — verify
/// against the writer side.
///
/// # Errors
///
/// Returns the error produced when the store iterator cannot be created.
pub fn documents_fields_repartition(&self, schema: Schema) -> RocksDbResult<HashMap<String, u64>> {
    // First pass: tally by attribute id, so the name is only resolved (and
    // allocated) once per attribute instead of once per stored key.
    let mut count_by_attr = HashMap::new();
    for document_key in DocumentsKeysIter(self.0.iter()?) {
        *count_by_attr.entry(document_key.attribute).or_insert(0u64) += 1;
    }

    // Second pass: swap every attribute id for its name in the schema.
    let count_by_name = count_by_attr
        .into_iter()
        .map(|(attr, count)| (schema.attribute_name(attr).to_owned(), count))
        .collect();

    Ok(count_by_name)
}
pub fn len(&self) -> RocksDbResult<u64> { pub fn len(&self) -> RocksDbResult<u64> {
let mut last_document_id = None; let mut last_document_id = None;
let mut count = 0; let mut count = 0;
@ -88,3 +103,20 @@ impl Iterator for DocumentFieldsIter<'_> {
} }
} }
} }
/// Iterator adapter over the raw entries of a `crate::CfIter` that yields
/// only the decoded keys, discarding the associated values.
pub struct DocumentsKeysIter<'a>(crate::CfIter<'a>);

impl Iterator for DocumentsKeysIter<'_> {
    type Item = DocumentAttrKey;

    /// Decodes the next raw key into a `DocumentAttrKey`, or returns `None`
    /// once the wrapped iterator is exhausted.
    ///
    /// # Panics
    ///
    /// Panics if the raw key bytes cannot be converted into the array that
    /// `DocumentAttrKey::from_be_bytes` expects.
    fn next(&mut self) -> Option<Self::Item> {
        self.0.next().map(|(raw_key, _)| {
            let bytes = raw_key.as_ref().try_into().unwrap();
            DocumentAttrKey::from_be_bytes(bytes)
        })
    }
}

View File

@ -1,4 +1,4 @@
use std::collections::{HashSet, BTreeMap}; use std::collections::{HashMap, HashSet, BTreeMap};
use std::convert::TryInto; use std::convert::TryInto;
use std::sync::Arc; use std::sync::Arc;
use std::sync::atomic::{AtomicU64, Ordering}; use std::sync::atomic::{AtomicU64, Ordering};
@ -155,11 +155,12 @@ fn last_update_id(
Ok(uikey.max(urikey).unwrap_or(0)) Ok(uikey.max(urikey).unwrap_or(0))
} }
#[derive(Copy, Clone)] #[derive(Clone)]
pub struct IndexStats { pub struct IndexStats {
pub number_of_words: usize, pub number_of_words: usize,
pub number_of_documents: u64, pub number_of_documents: u64,
pub number_attrs_in_ranked_map: usize, pub number_attrs_in_ranked_map: usize,
pub documents_fields_repartition: HashMap<String, u64>,
} }
#[derive(Clone)] #[derive(Clone)]
@ -275,10 +276,12 @@ impl Index {
pub fn stats(&self) -> RocksDbResult<IndexStats> { pub fn stats(&self) -> RocksDbResult<IndexStats> {
let cache = self.cache.load(); let cache = self.cache.load();
let documents_fields_repartition = self.documents_index.documents_fields_repartition(cache.schema.clone())?;
Ok(IndexStats { Ok(IndexStats {
number_of_words: cache.words.len(), number_of_words: cache.words.len(),
number_of_documents: cache.number_of_documents, number_of_documents: cache.number_of_documents,
number_attrs_in_ranked_map: cache.ranked_map.len(), number_attrs_in_ranked_map: cache.ranked_map.len(),
documents_fields_repartition,
}) })
} }

View File

@ -1,3 +1,5 @@
#[macro_use] extern crate maplit;
use std::sync::atomic::{AtomicBool, Ordering::Relaxed}; use std::sync::atomic::{AtomicBool, Ordering::Relaxed};
use std::sync::Arc; use std::sync::Arc;
@ -94,3 +96,67 @@ fn replace_document() {
assert_eq!(docs.len(), 1); assert_eq!(docs.len(), 1);
assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2)); assert_eq!(index.document(None, docs[0].id).unwrap().as_ref(), Some(&doc2));
} }
/// Adds three documents one at a time and checks, after each addition, that
/// the index stats report the expected document count and the expected
/// number of entries per field name.
#[test]
fn database_stats() {
    let tmp_dir = tempfile::tempdir().unwrap();
    let database = Database::open(&tmp_dir).unwrap();

    let has_been_updated = Arc::new(AtomicBool::new(false));
    let index = database.create_index("hello", simple_schema()).unwrap();

    // The callback raises the flag; every step below asserts it and resets it.
    let callback_flag = has_been_updated.clone();
    index.set_update_callback(move |_| callback_flag.store(true, Relaxed));

    // One entry per step: the document to add, then the document count and
    // the fields repartition expected once that addition has been processed.
    let steps = vec![
        (
            json!({ "objectId": 123, "title": "hello" }),
            1u64,
            hashmap!{
                "objectId".to_string() => 1u64,
                "title".to_string() => 1u64,
            },
        ),
        (
            json!({ "objectId": 456, "title": "world" }),
            2u64,
            hashmap!{
                "objectId".to_string() => 2u64,
                "title".to_string() => 2u64,
            },
        ),
        (
            // No "title" here: only the "objectId" counter is expected to grow.
            json!({ "objectId": 789 }),
            3u64,
            hashmap!{
                "objectId".to_string() => 3u64,
                "title".to_string() => 2u64,
            },
        ),
    ];

    for (document, expected_documents, expected_repartition) in steps {
        let mut addition = index.documents_addition();
        addition.update_document(&document);
        let update_id = addition.finalize().unwrap();

        let status = index.update_status_blocking(update_id).unwrap();
        assert!(has_been_updated.swap(false, Relaxed));
        assert!(status.result.is_ok());

        let stats = index.stats().unwrap();
        assert_eq!(stats.number_of_documents, expected_documents);
        assert_eq!(stats.documents_fields_repartition, expected_repartition);
    }
}