From 9205b640a484a41cfe79ff6b7410dd08773f3090 Mon Sep 17 00:00:00 2001
From: Alexey Shekhirin
Date: Wed, 31 Mar 2021 18:14:23 +0300
Subject: [PATCH] feat(index): introduce fields_ids_distribution

---
NOTE(review): this copy of the patch appears to have lost its angle-bracketed
text (e.g. `anyhow::Result>` and `SerdeJson>>` are unbalanced, and the From:
address is missing) -- confirm against the upstream commit before applying.

 milli/src/index.rs                      | 60 +++++++++++++++++++++++++
 milli/src/update/index_documents/mod.rs |  2 +-
 2 files changed, 61 insertions(+), 1 deletion(-)

diff --git a/milli/src/index.rs b/milli/src/index.rs
index a14747788..2e0d329ef 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -203,6 +203,25 @@ impl Index {
         Ok(self.main.get::<_, Str, SerdeJson>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default())
     }
 
+    /* fields ids distribution */
+
+    /// Returns the fields ids distribution, which associates each internal field id
+    /// with the number of times it occurs across the stored obkv documents.
+    // TODO: store in the index itself and change only within updates that modify the documents
+    pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result> {
+        let mut distribution = HashMap::new();
+
+        for document in self.documents.iter(rtxn)? { // recomputed from scratch on every call
+            let (_, obkv) = document?;
+
+            for (field_id, _) in obkv.iter() {
+                *distribution.entry(field_id).or_default() += 1; // unseen ids start from the default count
+            }
+        }
+
+        Ok(distribution)
+    }
+
     /* displayed fields */
 
     /// Writes the fields that must be displayed in the defined order.
@@ -429,3 +448,44 @@ impl Index {
         self.main.put::<_, Str, SerdeJson>>(wtxn, UPDATED_AT_KEY, &time)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use heed::EnvOpenOptions;
+
+    use crate::Index;
+    use crate::update::{IndexDocuments, UpdateFormat};
+
+    fn prepare_index() -> Index { // temp-dir index pre-filled with two JSON documents
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MiB
+        let index = Index::new(options, &path).unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"
+            { "name": "kevin" }
+            { "name": "bob", "age": 20 }
+        "#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::JsonStream);
+        builder.execute(content, |_, _| ()).unwrap(); // the callback ignores both arguments
+        wtxn.commit().unwrap();
+
+        index
+    }
+
+    #[test]
+    fn fields_ids_distribution() {
+        let index = prepare_index();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
+
+        let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap();
+        assert_eq!(fields_ids_distribution.len(), 2); // two distinct fields: "name" and "age"
+        assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1));
+        assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2));
+    }
+}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index ccbd95c7f..a19d8c0a7 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -925,7 +925,7 @@ mod tests {
         // one sent and that an UUID has been generated.
         assert_eq!(doc.get(0), Some(&br#""updated kevin""#[..]));
         // This is an UUID, it must be 36 bytes long plus the 2 surrounding string quotes (").
-        assert!(doc.get(1).unwrap().len() == 36 + 2);
+        assert_eq!(doc.get(1).unwrap().len(), 36 + 2);
         drop(rtxn);
     }
 