convert the field_distribution to a BTreeMap and avoid counting the same documents twice

Tamo 2021-06-17 17:05:34 +02:00
parent 969adaefdf
commit d08cfda796
4 changed files with 70 additions and 12 deletions
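
field_distribution tracks, for every field name, how many documents in the index contain that field. Before this change, replacing or re-adding a document added its fields to those counts again without first subtracting the counts of its previous version, so the same document could be counted twice. Below is a minimal standalone sketch of the corrected bookkeeping; the `Document` alias and the `replace_document` helper are made up for illustration and are not part of milli's indexing code.

```rust
use std::collections::BTreeMap;

// Same shape as the new `FieldsDistribution` alias introduced in this commit.
type FieldsDistribution = BTreeMap<String, u64>;

// Hypothetical, highly simplified document: just the names of its fields.
type Document = Vec<String>;

/// Replace the stored version of a document: first subtract the fields of the
/// previous version, then count the fields of the new one. Skipping the first
/// step is the double-counting bug this commit fixes.
fn replace_document(
    distribution: &mut FieldsDistribution,
    previous_version: Option<&Document>,
    new_version: &Document,
) {
    if let Some(previous) = previous_version {
        for field in previous {
            if let Some(count) = distribution.get_mut(field) {
                *count = count.saturating_sub(1);
            }
        }
        // Fields whose count dropped to zero no longer belong in the distribution.
        distribution.retain(|_, count| *count > 0);
    }
    for field in new_version {
        *distribution.entry(field.clone()).or_insert(0) += 1;
    }
}

fn main() {
    let mut distribution = FieldsDistribution::new();
    let doc = vec!["id".to_string(), "name".to_string()];

    replace_document(&mut distribution, None, &doc);
    // Re-indexing the exact same document must not inflate the counts.
    replace_document(&mut distribution, Some(&doc), &doc);
    assert_eq!(distribution.get("id"), Some(&1));
    assert_eq!(distribution.get("name"), Some(&1));
}
```

Adding the same documents a second time therefore leaves the distribution unchanged, which is exactly what the new tests below assert.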


@@ -791,7 +791,7 @@ pub(crate) mod tests {
     use std::ops::Deref;
 
     use heed::EnvOpenOptions;
-    use maplit::hashmap;
+    use maplit::btreemap;
     use tempfile::TempDir;
 
     use crate::update::{IndexDocuments, UpdateFormat};
@@ -845,11 +845,54 @@ pub(crate) mod tests {
         let field_distribution = index.field_distribution(&rtxn).unwrap();
         assert_eq!(
             field_distribution,
-            hashmap! {
+            btreemap! {
                 "id".to_string() => 2,
                 "name".to_string() => 2,
                 "age".to_string() => 1,
             }
         );
+
+        // we add all the documents a second time. we are supposed to get the same
+        // field_distribution in the end
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "age".to_string() => 1,
+            }
+        );
+
+        // then we update a document by removing one field and another by adding one field
+        let content = &br#"[
+            { "id": 1, "name": "kevin", "has_dog": true },
+            { "id": 2, "name": "bob" }
+        ]"#[..];
+
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        let field_distribution = index.field_distribution(&rtxn).unwrap();
+        assert_eq!(
+            field_distribution,
+            btreemap! {
+                "id".to_string() => 2,
+                "name".to_string() => 2,
+                "has_dog".to_string() => 1,
+            }
+        );
     }
 }


@@ -14,7 +14,7 @@ pub mod tree_level;
 pub mod update;
 
 use std::borrow::Cow;
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::hash::BuildHasherDefault;
 use std::result::Result as StdResult;
 
@@ -50,7 +50,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
-pub type FieldsDistribution = HashMap<String, u64>;
+pub type FieldsDistribution = BTreeMap<String, u64>;
 
 type MergeFn<E> = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> StdResult<Vec<u8>, E>;
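
One practical consequence of switching the `FieldsDistribution` alias from `HashMap` to `BTreeMap` is that iterating the distribution (and therefore serializing or comparing it, as the tests above do) happens in sorted key order and is deterministic across runs, whereas a `HashMap` order depends on the randomized hasher. The commit does not spell out its motivation, so treat this as an observation rather than the stated rationale. A small illustration:

```rust
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

fn main() {
    let mut distribution = FieldsDistribution::new();
    distribution.insert("name".to_string(), 2);
    distribution.insert("id".to_string(), 2);
    distribution.insert("age".to_string(), 1);

    // A BTreeMap always yields its keys in sorted order: age, id, name.
    // A HashMap would yield them in an order that can change from run to run.
    let keys: Vec<&str> = distribution.keys().map(String::as_str).collect();
    assert_eq!(keys, ["age", "id", "name"]);
}
```

The tests accordingly switch from maplit's hashmap! macro to its btreemap! macro when building the expected value, the literal-construction counterpart of this type change.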


@@ -1,4 +1,4 @@
-use std::collections::hash_map::Entry;
+use std::collections::btree_map::Entry;
 use std::collections::HashMap;
 
 use chrono::Utc;


@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::btree_map::Entry;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -419,11 +420,6 @@ impl Transform<'_, '_> {
                 // we use it and insert it in the list of replaced documents.
                 replaced_documents_ids.insert(docid);
 
-                // Depending on the update indexing method we will merge
-                // the document update with the current document or not.
-                match self.index_documents_method {
-                    IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
-                    IndexDocumentsMethod::UpdateDocuments => {
                 let key = BEU32::new(docid);
                 let base_obkv = self.index.documents.get(&self.rtxn, &key)?.ok_or(
                     InternalError::DatabaseMissingEntry {
@@ -431,6 +427,25 @@ impl Transform<'_, '_> {
                         key: None,
                     },
                 )?;
+
+                // we remove all the fields that were already counted
+                for (field_id, _) in base_obkv.iter() {
+                    let field_name = fields_ids_map.name(field_id).unwrap();
+                    if let Entry::Occupied(mut entry) =
+                        field_distribution.entry(field_name.to_string())
+                    {
+                        match entry.get().checked_sub(1) {
+                            Some(0) | None => entry.remove(),
+                            Some(count) => entry.insert(count),
+                        };
+                    }
+                }
+
+                // Depending on the update indexing method we will merge
+                // the document update with the current document or not.
+                match self.index_documents_method {
+                    IndexDocumentsMethod::ReplaceDocuments => (docid, update_obkv),
+                    IndexDocumentsMethod::UpdateDocuments => {
                         let update_obkv = obkv::KvReader::new(update_obkv);
                         merge_two_obkvs(base_obkv, update_obkv, &mut obkv_buffer);
                         (docid, obkv_buffer.as_slice())
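
The decrement loop added above relies on the btree_map::Entry API together with checked_sub so that a field whose count reaches zero (or would underflow) is removed from the map instead of lingering with a count of 0. Here is the same pattern lifted out of the diff into a small standalone helper; `decrement_field` is not a milli function, it only restates the diff's logic so it can be read and exercised in isolation:

```rust
use std::collections::btree_map::Entry;
use std::collections::BTreeMap;

type FieldsDistribution = BTreeMap<String, u64>;

/// Decrement the count of `field_name`, removing the entry entirely when the
/// count reaches zero (or when the subtraction would underflow).
fn decrement_field(distribution: &mut FieldsDistribution, field_name: &str) {
    if let Entry::Occupied(mut entry) = distribution.entry(field_name.to_string()) {
        match entry.get().checked_sub(1) {
            // `checked_sub` returns None on underflow; in both cases the field
            // no longer belongs in the distribution.
            Some(0) | None => {
                entry.remove();
            }
            Some(count) => {
                entry.insert(count);
            }
        }
    }
}

fn main() {
    let mut distribution: FieldsDistribution =
        [("id".to_string(), 2), ("age".to_string(), 1)].into_iter().collect();

    decrement_field(&mut distribution, "age");
    assert!(!distribution.contains_key("age")); // dropped once it hits zero

    decrement_field(&mut distribution, "id");
    assert_eq!(distribution.get("id"), Some(&1));
}
```

In the diff, this decrement runs for every field of the document's previous version (base_obkv) before the replacement or merged version is counted again, so the re-count starts from a clean slate instead of piling onto the old totals.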