diff --git a/milli/src/index.rs b/milli/src/index.rs index a6c09f3d3..2faf8d1f8 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -26,7 +26,7 @@ pub mod main_key { pub const DISTINCT_FIELD_KEY: &str = "distinct-field-key"; pub const DOCUMENTS_IDS_KEY: &str = "documents-ids"; pub const FILTERABLE_FIELDS_KEY: &str = "filterable-fields"; - pub const FIELDS_DISTRIBUTION_KEY: &str = "fields-distribution"; + pub const FIELD_DISTRIBUTION_KEY: &str = "fields-distribution"; pub const FIELDS_IDS_MAP_KEY: &str = "fields-ids-map"; pub const HARD_EXTERNAL_DOCUMENTS_IDS_KEY: &str = "hard-external-documents-ids"; pub const NUMBER_FACETED_DOCUMENTS_IDS_PREFIX: &str = "number-faceted-documents-ids"; @@ -290,28 +290,28 @@ impl Index { .unwrap_or_default()) } - /* fields distribution */ + /* field distribution */ - /// Writes the fields distribution which associates every field name with + /// Writes the field distribution which associates every field name with /// the number of times it occurs in the documents. - pub(crate) fn put_fields_distribution( + pub(crate) fn put_field_distribution( &self, wtxn: &mut RwTxn, distribution: &FieldsDistribution, ) -> heed::Result<()> { self.main.put::<_, Str, SerdeJson<FieldsDistribution>>( wtxn, - main_key::FIELDS_DISTRIBUTION_KEY, + main_key::FIELD_DISTRIBUTION_KEY, distribution, ) } - /// Returns the fields distribution which associates every field name with + /// Returns the field distribution which associates every field name with /// the number of times it occurs in the documents. - pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { + pub fn field_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> { Ok(self .main - .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELDS_DISTRIBUTION_KEY)? + .get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, main_key::FIELD_DISTRIBUTION_KEY)? 
.unwrap_or_default()) } @@ -791,7 +791,7 @@ pub(crate) mod tests { use std::ops::Deref; use heed::EnvOpenOptions; - use maplit::hashmap; + use maplit::btreemap; use tempfile::TempDir; use crate::update::{IndexDocuments, UpdateFormat}; @@ -823,7 +823,7 @@ pub(crate) mod tests { } #[test] - fn initial_fields_distribution() { + fn initial_field_distribution() { let path = tempfile::tempdir().unwrap(); let mut options = EnvOpenOptions::new(); options.map_size(10 * 1024 * 1024); // 10 MB @@ -842,14 +842,57 @@ pub(crate) mod tests { let rtxn = index.read_txn().unwrap(); - let fields_distribution = index.fields_distribution(&rtxn).unwrap(); + let field_distribution = index.field_distribution(&rtxn).unwrap(); assert_eq!( - fields_distribution, - hashmap! { + field_distribution, + btreemap! { "id".to_string() => 2, "name".to_string() => 2, "age".to_string() => 1, } ); + + // we add all the documents a second time. we are supposed to get the same + // field_distribution in the end + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let field_distribution = index.field_distribution(&rtxn).unwrap(); + assert_eq!( + field_distribution, + btreemap! 
{ + "id".to_string() => 2, + "name".to_string() => 2, + "age".to_string() => 1, + } + ); + + // then we update a document by removing one field and another by adding one field + let content = &br#"[ + { "id": 1, "name": "kevin", "has_dog": true }, + { "id": 2, "name": "bob" } + ]"#[..]; + let mut wtxn = index.write_txn().unwrap(); + let mut builder = IndexDocuments::new(&mut wtxn, &index, 0); + builder.update_format(UpdateFormat::Json); + builder.execute(content, |_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let field_distribution = index.field_distribution(&rtxn).unwrap(); + assert_eq!( + field_distribution, + btreemap! { + "id".to_string() => 2, + "name".to_string() => 2, + "has_dog".to_string() => 1, + } + ); } } diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 7571d8a53..e88ac62d5 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -14,7 +14,7 @@ pub mod tree_level; pub mod update; use std::borrow::Cow; -use std::collections::HashMap; +use std::collections::{BTreeMap, HashMap}; use std::hash::BuildHasherDefault; use std::result::Result as StdResult; @@ -22,7 +22,9 @@ use fxhash::{FxHasher32, FxHasher64}; use serde_json::{Map, Value}; pub use self::criterion::{default_criteria, Criterion}; -pub use self::error::{Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError}; +pub use self::error::{ + Error, FieldIdMapMissingEntry, InternalError, SerializationError, UserError, +}; pub use self::external_documents_ids::ExternalDocumentsIds; pub use self::fields_ids_map::FieldsIdsMap; pub use self::heed_codec::{ @@ -48,7 +50,7 @@ pub type Attribute = u32; pub type DocumentId = u32; pub type FieldId = u8; pub type Position = u32; -pub type FieldsDistribution = HashMap<String, u64>; +pub type FieldsDistribution = BTreeMap<String, u64>; type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>; diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index 42dd55443..dbb932bfe 100644 
--- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -47,7 +47,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> { self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?; self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?; self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?; - self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?; + self.index.put_field_distribution(self.wtxn, &FieldsDistribution::default())?; // We clean all the faceted documents ids. let empty = RoaringBitmap::default(); @@ -113,7 +113,7 @@ mod tests { assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty()); assert!(index.external_documents_ids(&rtxn).unwrap().is_empty()); assert!(index.documents_ids(&rtxn).unwrap().is_empty()); - assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); assert!(index.word_docids.is_empty(&rtxn).unwrap()); assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap()); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index dfb48dc58..e291eb106 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -1,4 +1,4 @@ -use std::collections::hash_map::Entry; +use std::collections::btree_map::Entry; use std::collections::HashMap; use chrono::Utc; @@ -147,7 +147,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - let mut fields_distribution = self.index.fields_distribution(self.wtxn)?; + let mut field_distribution = self.index.field_distribution(self.wtxn)?; // We use pre-calculated number of fields occurrences that needs to be deleted // to reflect deleted documents. @@ -155,7 +155,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { // Otherwise, insert new number of occurrences (current_count - count_diff). 
for (field_id, count_diff) in fields_ids_distribution_diff { let field_name = fields_ids_map.name(field_id).unwrap(); - if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) { + if let Entry::Occupied(mut entry) = field_distribution.entry(field_name.to_string()) { match entry.get().checked_sub(count_diff) { Some(0) | None => entry.remove(), Some(count) => entry.insert(count), @@ -163,7 +163,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } - self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + self.index.put_field_distribution(self.wtxn, &field_distribution)?; // We create the FST map of the external ids that we must delete. external_ids.sort_unstable(); @@ -479,7 +479,7 @@ mod tests { let rtxn = index.read_txn().unwrap(); - assert!(index.fields_distribution(&rtxn).unwrap().is_empty()); + assert!(index.field_distribution(&rtxn).unwrap().is_empty()); } #[test] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 05242f540..a25b0f3a7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -378,7 +378,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { let TransformOutput { primary_key, fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids, new_documents_ids, replaced_documents_ids, @@ -594,8 +594,8 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { // We write the fields ids map into the main database self.index.put_fields_ids_map(self.wtxn, &fields_ids_map)?; - // We write the fields distribution into the main database - self.index.put_fields_distribution(self.wtxn, &fields_distribution)?; + // We write the field distribution into the main database + self.index.put_field_distribution(self.wtxn, &field_distribution)?; // We write the primary key field id into the main database self.index.put_primary_key(self.wtxn, &primary_key)?; diff --git 
a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 756ff492e..074d281ba 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -1,4 +1,5 @@ use std::borrow::Cow; +use std::collections::btree_map::Entry; use std::fs::File; use std::io::{Read, Seek, SeekFrom}; use std::iter::Peekable; @@ -25,7 +26,7 @@ const DEFAULT_PRIMARY_KEY_NAME: &str = "id"; pub struct TransformOutput { pub primary_key: String, pub fields_ids_map: FieldsIdsMap, - pub fields_distribution: FieldsDistribution, + pub field_distribution: FieldsDistribution, pub external_documents_ids: ExternalDocumentsIds<'static>, pub new_documents_ids: RoaringBitmap, pub replaced_documents_ids: RoaringBitmap, @@ -127,7 +128,7 @@ impl Transform<'_, '_> { return Ok(TransformOutput { primary_key, fields_ids_map, - fields_distribution: self.index.fields_distribution(self.rtxn)?, + field_distribution: self.index.field_distribution(self.rtxn)?, external_documents_ids: ExternalDocumentsIds::default(), new_documents_ids: RoaringBitmap::new(), replaced_documents_ids: RoaringBitmap::new(), @@ -385,7 +386,7 @@ impl Transform<'_, '_> { Error: From<E>, { let documents_ids = self.index.documents_ids(self.rtxn)?; - let mut fields_distribution = self.index.fields_distribution(self.rtxn)?; + let mut field_distribution = self.index.field_distribution(self.rtxn)?; let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids); // Once we have sort and deduplicated the documents we write them into a final file. @@ -419,18 +420,32 @@ impl Transform<'_, '_> { // we use it and insert it in the list of replaced documents. 
replaced_documents_ids.insert(docid); + let key = BEU32::new(docid); + let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( + InternalError::DatabaseMissingEntry { + db_name: db_name::DOCUMENTS, + key: None, + }, + )?; + + // we remove all the fields that were already counted + for (field_id, _) in base_obkv.iter() { + let field_name = fields_ids_map.name(field_id).unwrap(); + if let Entry::Occupied(mut entry) = + field_distribution.entry(field_name.to_string()) + { + match entry.get().checked_sub(1) { + Some(0) | None => entry.remove(), + Some(count) => entry.insert(count), + }; + } + } + // Depending on the update indexing method we will merge // the document update with the current document or not. match self.index_documents_method { IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv), IndexDocumentsMethod::UpdateDocuments => { - let key = BEU32::new(docid); - let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or( - InternalError::DatabaseMissingEntry { - db_name: db_name::DOCUMENTS, - key: None, - }, - )?; let update_obkv = obkv::KvReader::new(update_obkv); merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer); (docid, obkv_buffer.as_slice()) @@ -455,7 +470,7 @@ impl Transform<'_, '_> { let reader = obkv::KvReader::new(obkv); for (field_id, _) in reader.iter() { let field_name = fields_ids_map.name(field_id).unwrap(); - *fields_distribution.entry(field_name.to_string()).or_default() += 1; + *field_distribution.entry(field_name.to_string()).or_default() += 1; } } @@ -485,7 +500,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids, replaced_documents_ids, @@ -503,7 +518,7 @@ impl Transform<'_, '_> { old_fields_ids_map: FieldsIdsMap, new_fields_ids_map: FieldsIdsMap, ) -> Result<TransformOutput> { - let 
field_distribution = self.index.field_distribution(self.rtxn)?; let external_documents_ids = self.index.external_documents_ids(self.rtxn)?; let documents_ids = self.index.documents_ids(self.rtxn)?; let documents_count = documents_ids.len() as usize; @@ -540,7 +555,7 @@ impl Transform<'_, '_> { Ok(TransformOutput { primary_key, fields_ids_map: new_fields_ids_map, - fields_distribution, + field_distribution, external_documents_ids: external_documents_ids.into_static(), new_documents_ids: documents_ids, replaced_documents_ids: RoaringBitmap::default(),