feat(index): store fields distribution in index

This commit is contained in:
Alexey Shekhirin 2021-03-31 18:14:23 +03:00
parent 67e25f8724
commit 27c7ab6e00
No known key found for this signature in database
GPG key ID: AF9A26AA133B5B98
5 changed files with 45 additions and 26 deletions

View file

@ -23,6 +23,7 @@ pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
pub const FACETED_DOCUMENTS_IDS_PREFIX: &str = "faceted-documents-ids";
pub const FACETED_FIELDS_KEY: &str = "faceted-fields";
pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
pub const PRIMARY_KEY_KEY: &str = "primary-key";
pub const SEARCHABLE_FIELDS_KEY: &str = "searchable-fields";
pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
@ -33,6 +34,8 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
const CREATED_AT_KEY: &str = "created-at";
const UPDATED_AT_KEY: &str = "updated-at";
pub type FieldsDistribution = HashMap<String, u64>;
#[derive(Clone)]
pub struct Index {
/// The LMDB environment which this index is associated with.
@ -204,23 +207,18 @@ impl Index {
Ok(self.main.get::<_, Str, SerdeJson<FieldsIdsMap>>(rtxn, FIELDS_IDS_MAP_KEY)?.unwrap_or_default())
}
/* fields ids distribution */
/* fields distribution */
/// Returns the fields ids distribution which associate the internal field ids
/// with the number of times it occurs in the obkv documents.
// TODO store in the index itself and change only within updates that modify the documents
pub fn fields_ids_distribution(&self, rtxn: &RoTxn) -> anyhow::Result<HashMap<FieldId, u64>> {
let mut distribution = HashMap::new();
/// Writes the fields distribution which associate the field with the number of times
/// it occurs in the obkv documents.
pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution)
}
for document in self.documents.iter(rtxn)? {
let (_, obkv) = document?;
for (field_id, _) in obkv.iter() {
*distribution.entry(field_id).or_default() += 1;
}
}
Ok(distribution)
/// Returns the fields distribution which associate the field with the number of times
/// it occurs in the obkv documents.
pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default())
}
/* displayed fields */
@ -469,6 +467,7 @@ impl Index {
#[cfg(test)]
mod tests {
use heed::EnvOpenOptions;
use maplit::hashmap;
use crate::Index;
use crate::update::{IndexDocuments, UpdateFormat};
@ -493,16 +492,15 @@ mod tests {
}
#[test]
fn fields_ids_distribution() {
fn initial_fields_distribution() {
let index = prepare_index();
let rtxn = index.read_txn().unwrap();
let fields_ids_map = index.fields_ids_map(&rtxn).unwrap();
let fields_ids_distribution = index.fields_ids_distribution(&rtxn).unwrap();
assert_eq!(fields_ids_distribution.len(), 2);
assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("age").unwrap()), Some(&1));
assert_eq!(fields_ids_distribution.get(&fields_ids_map.id("name").unwrap()), Some(&2));
let fields_distribution = index.fields_distribution(&rtxn).unwrap();
assert_eq!(fields_distribution, hashmap!{
"age".to_string() => 1,
"name".to_string() => 2
});
}
}