240: Field distribution r=Kerollmops a=irevoire

closes #199
closes #198 


Co-authored-by: Tamo <tamo@meilisearch.com>
bors[bot] 2021-06-19 10:14:25 +00:00, committed by GitHub
commit 5b19dd23d9
6 changed files with 100 additions and 40 deletions


@@ -26,7 +26,7 @@ pub mod main_key {
     pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key";
     pub const DOCUMENTS_IDS_KEY: &str = "documents-ids";
     pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields";
-    pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution";
+    pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution";
     pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map";
     pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids";
     pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids";
@@ -290,28 +290,28 @@ impl Index {
             .unwrap_or_default())
     }

-    /* fields distribution */
+    /* field distribution */

-    /// Writes the fields distribution which associates every field name with
+    /// Writes the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub(crate) fn put_fields_distribution(
+    pub(crate) fn put_field_distribution(
        &self,
         wtxn: &mut RwTxn,
         distribution: &FieldsDistribution,
     ) -> heed::Result<()> {
         self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(
             wtxn,
-            main_key::FIELDS_DISTRIBUTION_KEY,
+            main_key::FIELD_DISTRIBUTION_KEY,
             distribution,
         )
     }

-    /// Returns the fields distribution which associates every field name with
+    /// Returns the field distribution which associates every field name with
     /// the number of times it occurs in the documents.
-    pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
+    pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
         Ok(self
             .main
-            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)?
+            .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)?
             .unwrap_or_default())
     }
@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;

     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;

     use crate::update::{IndexDocuments, UpdateFormat};
@@ -823,7 +823,7 @@ pub(crate) mod tests {
     }

     #[test]
-    fn initial_fields_distribution() {
+    fn initial_field_distribution() {
         let path = tempfile::tempdir().unwrap();
         let mut options = EnvOpenOptions::new();
         options.map_size(10 * 1024 * 1024); // 10 MB
@@ -842,14 +842,57 @@ pub(crate) mod tests {

         let rtxn = index.read_txn().unwrap();

-        let fields_distribution = index.fields_distribution(&rtxn).unwrap();
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
-            fields_distribution,
-            hashmap! {
+            field_distribution,
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }


@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;

 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
@@ -22,7 +22,9 @@ use fxhash::{FxHasher32, FxHasher64};
 use serde_json::{Map, Value};

 pub use self::criterion::{default_criteria, Criterion};
-pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError};
+pub use self::error::{
+    Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError,
+};
 pub use self::external_documents_ids::ExternalDocumentsIds;
 pub use self::fields_ids_map::FieldsIdsMap;
 pub use self::heed_codec::{
@@ -48,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;

 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
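The HashMap to BTreeMap swap above is what makes the btreemap! assertions in the tests practical, and it also makes the reported distribution deterministic: a BTreeMap iterates, and therefore serializes, its keys in sorted order. A quick sketch of that property, with serde_json used only to show the output ordering:

    use std::collections::BTreeMap;

    fn main() {
        // Keys inserted in arbitrary order come back sorted, so the
        // JSON produced for clients is identical on every run.
        let mut distribution: BTreeMap<String, u64> = BTreeMap::new();
        distribution.insert("name".to_string(), 2);
        distribution.insert("age".to_string(), 1);
        distribution.insert("id".to_string(), 2);

        let json = serde_json::to_string(&distribution).unwrap();
        assert_eq!(json, r#"{"age":1,"id":2,"name":2}"#);
    }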


@@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
-        self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
+        self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?;

         // We clean all the faceted documents ids.
         let empty = RoaringBitmap::default();
@@ -113,7 +113,7 @@ mod tests {
         assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
         assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
         assert!(index.documents_ids(&rtxn).unwrap().is_empty());
-        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+        assert!(index.field_distribution(&rtxn).unwrap().is_empty());
         assert!(index.word_docids.is_empty(&rtxn).unwrap());
         assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());


@ -1,4 +1,4 @@
use std::collections::hash_map::Entry; use std::collections::btree_map::Entry;
use std::collections::HashMap; use std::collections::HashMap;
use chrono::Utc; use chrono::Utc;
@ -147,7 +147,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
} }
let mut fields_distribution = self.index.fields_distribution(self.wtxn)?; let mut field_distribution = self.index.field_distribution(self.wtxn)?;
// We use pre-calculated number of fields occurrences that needs to be deleted // We use pre-calculated number of fields occurrences that needs to be deleted
// to reflect deleted documents. // to reflect deleted documents.
@ -155,7 +155,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
// Otherwise, insert new number of occurrences (current_count - count_diff). // Otherwise, insert new number of occurrences (current_count - count_diff).
for (field_id, count_diff) in fields_ids_distribution_diff { for (field_id, count_diff) in fields_ids_distribution_diff {
let field_name = fields_ids_map.name(field_id).unwrap(); let field_name = fields_ids_map.name(field_id).unwrap();
if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) {
match entry.get().checked_sub(count_diff) { match entry.get().checked_sub(count_diff) {
Some(0) | None => entry.remove(), Some(0) | None => entry.remove(),
Some(count) => entry.insert(count), Some(count) => entry.insert(count),
@ -163,7 +163,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
} }
} }
self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; self.index.put_field_distribution(self.wtxn, &field_distribution)?;
// We create the FST map of the external ids that we must delete. // We create the FST map of the external ids that we must delete.
external_ids.sort_unstable(); external_ids.sort_unstable();
@ -479,7 +479,7 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); assert!(index.field_distribution(&rtxn).unwrap().is_empty());
} }
#[test] #[test]
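The decrement logic in this file deserves a note: each deleted occurrence is subtracted with checked_sub, and the entry is removed outright once it reaches zero or would underflow, so the distribution never reports a field that no remaining document contains. A self-contained sketch of that Entry pattern, with a hypothetical decrement_field helper:

    use std::collections::btree_map::Entry;
    use std::collections::BTreeMap;

    // Subtract `count_diff` occurrences of `field_name`, dropping the
    // entry entirely when the count reaches zero or would underflow.
    fn decrement_field(distribution: &mut BTreeMap<String, u64>, field_name: &str, count_diff: u64) {
        if let Entry::Occupied(mut entry) = distribution.entry(field_name.to_string()) {
            match entry.get().checked_sub(count_diff) {
                Some(0) | None => entry.remove(),
                Some(count) => entry.insert(count),
            };
        }
    }

    fn main() {
        let mut distribution = BTreeMap::from([("name".to_string(), 2), ("age".to_string(), 1)]);
        decrement_field(&mut distribution, "age", 1);
        decrement_field(&mut distribution, "name", 1);
        assert!(!distribution.contains_key("age")); // removed at zero
        assert_eq!(distribution.get("name"), Some(&1));
    }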


@@ -378,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         let TransformOutput {
             primary_key,
             fields_ids_map,
-            fields_distribution,
+            field_distribution,
             external_documents_ids,
             new_documents_ids,
             replaced_documents_ids,
@@ -594,8 +594,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         // We write the fields ids map into the main database
         self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?;

-        // We write the fields distribution into the main database
-        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+        // We write the field distribution into the main database
+        self.index.put_field_distribution(self.wtxn, &field_distribution)?;

         // We write the primary key field id into the main database
         self.index.put_primary_key(self.wtxn, &primary_key)?;


@ -1,4 +1,5 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::collections::btree_map::Entry;
use std::fs::File; use std::fs::File;
use std::io::{Read, Seek, SeekFrom}; use std::io::{Read, Seek, SeekFrom};
use std::iter::Peekable; use std::iter::Peekable;
@ -25,7 +26,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
pub struct TransformOutput { pub struct TransformOutput {
pub primary_key: String, pub primary_key: String,
pub fields_ids_map: FieldsIdsMap, pub fields_ids_map: FieldsIdsMap,
pub fields_distribution: FieldsDistribution, pub field_distribution: FieldsDistribution,
pub external_documents_ids: ExternalDocumentsIds<'static>, pub external_documents_ids: ExternalDocumentsIds<'static>,
pub new_documents_ids: RoaringBitmap, pub new_documents_ids: RoaringBitmap,
pub replaced_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap,
@ -127,7 +128,7 @@ impl Transform<'_, '_> {
return Ok(TransformOutput { return Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
fields_distribution: self.index.fields_distribution(self.rtxn)?, field_distribution: self.index.field_distribution(self.rtxn)?,
external_documents_ids: ExternalDocumentsIds::default(), external_documents_ids: ExternalDocumentsIds::default(),
new_documents_ids: RoaringBitmap::new(), new_documents_ids: RoaringBitmap::new(),
replaced_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(),
@ -385,7 +386,7 @@ impl Transform<'_, '_> {
Error: From<E>, Error: From<E>,
{ {
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; let mut field_distribution = self.index.field_distribution(self.rtxn)?;
let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);
// Once we have sort and deduplicated the documents we write them into a final file. // Once we have sort and deduplicated the documents we write them into a final file.
@ -419,11 +420,6 @@ impl Transform<'_, '_> {
// we use it and insert it in the list of replaced documents. // we use it and insert it in the list of replaced documents.
replaced_documents_ids.insert(docid); replaced_documents_ids.insert(docid);
// Depending on the update indexing method we will merge
// the document update with the current document or not.
match self.index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => {
let key = BEU32::new(docid); let key = BEU32::new(docid);
let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
InternalError::DatabaseMissingEntry { InternalError::DatabaseMissingEntry {
@ -431,6 +427,25 @@ impl Transform<'_, '_> {
key: None, key: None,
}, },
)?; )?;
// we remove all the fields that were already counted
for (field_id, _) in base_obkv.iter() {
let field_name = fields_ids_map.name(field_id).unwrap();
if let Entry::Occupied(mut entry) =
field_distribution.entry(field_name.to_string())
{
match entry.get().checked_sub(1) {
Some(0) | None => entry.remove(),
Some(count) => entry.insert(count),
};
}
}
// Depending on the update indexing method we will merge
// the document update with the current document or not.
match self.index_documents_method {
IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
IndexDocumentsMethod::UpdateDocuments => {
let update_obkv = obkv::KvReader::new(update_obkv); let update_obkv = obkv::KvReader::new(update_obkv);
merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
(docid, obkv_buffer.as_slice()) (docid, obkv_buffer.as_slice())
@ -455,7 +470,7 @@ impl Transform<'_, '_> {
let reader = obkv::KvReader::new(obkv); let reader = obkv::KvReader::new(obkv);
for (field_id, _) in reader.iter() { for (field_id, _) in reader.iter() {
let field_name = fields_ids_map.name(field_id).unwrap(); let field_name = fields_ids_map.name(field_id).unwrap();
*fields_distribution.entry(field_name.to_string()).or_default() += 1; *field_distribution.entry(field_name.to_string()).or_default() += 1;
} }
} }
@ -485,7 +500,7 @@ impl Transform<'_, '_> {
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map, fields_ids_map,
fields_distribution, field_distribution,
external_documents_ids: external_documents_ids.into_static(), external_documents_ids: external_documents_ids.into_static(),
new_documents_ids, new_documents_ids,
replaced_documents_ids, replaced_documents_ids,
@ -503,7 +518,7 @@ impl Transform<'_, '_> {
old_fields_ids_map: FieldsIdsMap, old_fields_ids_map: FieldsIdsMap,
new_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap,
) -> Result<TransformOutput> { ) -> Result<TransformOutput> {
let fields_distribution = self.index.fields_distribution(self.rtxn)?; let field_distribution = self.index.field_distribution(self.rtxn)?;
let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?;
let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?;
let documents_count = documents_ids.len() as usize; let documents_count = documents_ids.len() as usize;
@ -540,7 +555,7 @@ impl Transform<'_, '_> {
Ok(TransformOutput { Ok(TransformOutput {
primary_key, primary_key,
fields_ids_map: new_fields_ids_map, fields_ids_map: new_fields_ids_map,
fields_distribution, field_distribution,
external_documents_ids: external_documents_ids.into_static(), external_documents_ids: external_documents_ids.into_static(),
new_documents_ids: documents_ids, new_documents_ids: documents_ids,
replaced_documents_ids: RoaringBitmap::default(), replaced_documents_ids: RoaringBitmap::default(),