feat(index): update fields distribution in clear & delete operations

fixes after review

bump the version of the tokenizer

implement a first version of the stop_words

The frontend must provide a BTreeSet containing the stop words.
The stop_words are set to None if an empty set is provided.
add the stop-words to the http-ui interface

Use maplit in the test
and remove all the useless drop(rtxn) calls at the end of the tests

Integrate the stop_words into the query tree

remove the stop_words from the query tree, except when the word is a prefix or a typo

more fixes after review
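
For illustration (not part of this commit's diff): a minimal sketch of the stop-words rule described above. The `normalize_stop_words` helper is hypothetical; the actual settings builder in this PR may expose the behavior differently.

    use std::collections::BTreeSet;

    // An empty set provided by the frontend is stored as None, as the message above describes.
    fn normalize_stop_words(words: BTreeSet<String>) -> Option<BTreeSet<String>> {
        if words.is_empty() { None } else { Some(words) }
    }
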
Alexey Shekhirin 2021-04-01 10:07:16 +03:00
parent 27c7ab6e00
commit 2658c5c545
7 changed files with 128 additions and 34 deletions

Cargo.lock (generated)

@@ -1520,8 +1520,7 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e"
 [[package]]
 name = "pest"
 version = "2.1.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
+source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
 dependencies = [
  "ucd-trie",
 ]
@@ -1529,7 +1528,8 @@ dependencies = [
 [[package]]
 name = "pest"
 version = "2.1.3"
-source = "git+https://github.com/pest-parser/pest.git?rev=51fd1d49f1041f7839975664ef71fe15c7dcaf67#51fd1d49f1041f7839975664ef71fe15c7dcaf67"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "10f4872ae94d7b90ae48754df22fd42ad52ce740b8f370b03da4835417403e53"
 dependencies = [
  "ucd-trie",
 ]

milli/src/external_documents_ids.rs

@@ -19,6 +19,11 @@ impl<'a> ExternalDocumentsIds<'a> {
         }
     }
 
+    /// Returns `true` if hard and soft external documents lists are empty.
+    pub fn is_empty(&self) -> bool {
+        self.hard.is_empty() && self.soft.is_empty()
+    }
+
     pub fn get<A: AsRef<[u8]>>(&self, external_id: A) -> Option<u32> {
         let external_id = external_id.as_ref();
         match self.soft.get(external_id).or_else(|| self.hard.get(external_id)) {
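
For illustration (not part of the diff): a hedged usage sketch of the new helper, assuming an `index` handle and a read transaction as used in this PR's tests.

    let rtxn = index.read_txn().unwrap();
    // Empty once both the hard and soft external ids lists are cleared.
    assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
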

milli/src/index.rs

@@ -10,7 +10,7 @@ use chrono::{Utc, DateTime};
 
 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
-use crate::{default_criteria, Criterion, Search, FacetDistribution};
+use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution};
 use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
 use crate::{
     RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
@@ -34,8 +34,6 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";
 
-pub type FieldsDistribution = HashMap<String, u64>;
-
 #[derive(Clone)]
 pub struct Index {
     /// The LMDB environment which this index is associated with.
@@ -209,14 +207,14 @@ impl Index {
     /* fields distribution */
 
-    /// Writes the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Writes the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
     pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution)
+        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution)
     }
 
-    /// Returns the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Returns the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
     pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
         Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default())
    }
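
For illustration (not part of the diff): a minimal round trip through the two methods above, assuming an `index` opened as in the tests below.

    let mut wtxn = index.write_txn().unwrap();
    let mut distribution = FieldsDistribution::new();
    distribution.insert("name".to_string(), 2);
    index.put_fields_distribution(&mut wtxn, &distribution).unwrap();
    wtxn.commit().unwrap();

    let rtxn = index.read_txn().unwrap();
    assert_eq!(index.fields_distribution(&rtxn).unwrap(), distribution);
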
@@ -472,35 +470,29 @@ mod tests {
     use crate::Index;
     use crate::update::{IndexDocuments, UpdateFormat};
 
-    fn prepare_index() -> Index {
+    #[test]
+    fn initial_fields_distribution() {
         let path = tempfile::tempdir().unwrap();
         let mut options = EnvOpenOptions::new();
         options.map_size(10 * 1024 * 1024); // 10 MB
         let index = Index::new(options, &path).unwrap();
 
         let mut wtxn = index.write_txn().unwrap();
-        let content = &br#"
-            { "name": "kevin" }
-            { "name": "bob", "age": 20 }
-        "#[..];
+        let content = &br#"[
+            { "name": "kevin" },
+            { "name": "bob", "age": 20 }
+        ]"#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
-        builder.update_format(UpdateFormat::JsonStream);
+        builder.update_format(UpdateFormat::Json);
         builder.execute(content, |_, _| ()).unwrap();
         wtxn.commit().unwrap();
 
-        index
-    }
-
-    #[test]
-    fn initial_fields_distribution() {
-        let index = prepare_index();
-
         let rtxn = index.read_txn().unwrap();
 
         let fields_distribution = index.fields_distribution(&rtxn).unwrap();
         assert_eq!(fields_distribution, hashmap!{
-            "name".to_string() => 2,
+            "age".to_string() => 1,
+            "name".to_string() => 2
         });
     }
 }

milli/src/lib.rs

@@ -41,6 +41,7 @@ pub type Attribute = u32;
 pub type DocumentId = u32;
 pub type FieldId = u8;
 pub type Position = u32;
+pub type FieldsDistribution = HashMap<String, u64>;
 
 type MergeFn = for<'a> fn(&[u8], &[Cow<'a, [u8]>]) -> anyhow::Result<Vec<u8>>;
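
For illustration (not part of the diff): with the alias moved from index.rs to the crate root, call sites import it directly; the "title" field below is made up.

    use milli::FieldsDistribution; // previously crate::index::FieldsDistribution

    let mut distribution = FieldsDistribution::new(); // HashMap<String, u64>
    *distribution.entry("title".to_string()).or_default() += 1;
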

milli/src/update/clear_documents.rs

@@ -1,6 +1,6 @@
 use chrono::Utc;
 use roaring::RoaringBitmap;
-use crate::{ExternalDocumentsIds, Index};
+use crate::{ExternalDocumentsIds, Index, FieldsDistribution};
 
 pub struct ClearDocuments<'t, 'u, 'i> {
     wtxn: &'t mut heed::RwTxn<'i, 'u>,
@@ -42,6 +42,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         self.index.put_words_prefixes_fst(self.wtxn, &fst::Set::default())?;
         self.index.put_external_documents_ids(self.wtxn, &ExternalDocumentsIds::default())?;
         self.index.put_documents_ids(self.wtxn, &RoaringBitmap::default())?;
+        self.index.put_fields_distribution(self.wtxn, &FieldsDistribution::default())?;
 
         // We clean all the faceted documents ids.
         for (field_id, _) in faceted_fields {
@@ -61,3 +62,54 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
         Ok(number_of_documents)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use heed::EnvOpenOptions;
+
+    use crate::update::{IndexDocuments, UpdateFormat};
+    use super::*;
+
+    #[test]
+    fn clear_documents() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        let mut wtxn = index.write_txn().unwrap();
+        let content = &br#"[
+            { "id": 0, "name": "kevin", "age": 20 },
+            { "id": 1, "name": "kevina" },
+            { "id": 2, "name": "benoit", "country": "France" }
+        ]"#[..];
+        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
+        builder.update_format(UpdateFormat::Json);
+        builder.execute(content, |_, _| ()).unwrap();
+
+        // Clear all documents from the database.
+        let builder = ClearDocuments::new(&mut wtxn, &index, 1);
+        assert_eq!(builder.execute().unwrap(), 3);
+        wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        assert_eq!(index.fields_ids_map(&rtxn).unwrap().len(), 4);
+
+        assert!(index.words_fst(&rtxn).unwrap().is_empty());
+        assert!(index.words_prefixes_fst(&rtxn).unwrap().is_empty());
+        assert!(index.external_documents_ids(&rtxn).unwrap().is_empty());
+        assert!(index.documents_ids(&rtxn).unwrap().is_empty());
+        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
+
+        assert!(index.word_docids.is_empty(&rtxn).unwrap());
+        assert!(index.word_prefix_docids.is_empty(&rtxn).unwrap());
+        assert!(index.docid_word_positions.is_empty(&rtxn).unwrap());
+        assert!(index.word_pair_proximity_docids.is_empty(&rtxn).unwrap());
+        assert!(index.word_prefix_pair_proximity_docids.is_empty(&rtxn).unwrap());
+        assert!(index.facet_field_id_value_docids.is_empty(&rtxn).unwrap());
+        assert!(index.field_id_docid_facet_values.is_empty(&rtxn).unwrap());
+        assert!(index.documents.is_empty(&rtxn).unwrap());
+    }
+}

milli/src/update/delete_documents.rs

@@ -1,3 +1,6 @@
+use std::collections::HashMap;
+use std::collections::hash_map::Entry;
+
 use anyhow::anyhow;
 use chrono::Utc;
 use fst::IntoStreamer;
@@ -90,6 +93,9 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             documents,
         } = self.index;
 
+        // Number of fields for each document that has been deleted.
+        let mut fields_ids_distribution_diff = HashMap::new();
+
         // Retrieve the words and the external documents ids contained in the documents.
         let mut words = Vec::new();
         let mut external_ids = Vec::new();
@@ -100,6 +106,10 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             let key = BEU32::new(docid);
             let mut iter = documents.range_mut(self.wtxn, &(key..=key))?;
             if let Some((_key, obkv)) = iter.next().transpose()? {
+                for (field_id, _) in obkv.iter() {
+                    *fields_ids_distribution_diff.entry(field_id).or_default() += 1;
+                }
+
                 if let Some(content) = obkv.get(id_field) {
                     let external_id = match serde_json::from_slice(content).unwrap() {
                         Value::String(string) => SmallString32::from(string.as_str()),
@@ -112,7 +122,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
             drop(iter);
 
-            // We iterate througt the words positions of the document id,
+            // We iterate through the words positions of the document id,
             // retrieve the word and delete the positions.
             let mut iter = docid_word_positions.prefix_iter_mut(self.wtxn, &(docid, ""))?;
             while let Some(result) = iter.next() {
@@ -123,6 +133,24 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
             }
         }
 
+        let mut fields_distribution = self.index.fields_distribution(self.wtxn)?;
+
+        // We use the pre-computed count of field occurrences per deleted document
+        // to update the fields distribution: if all occurrences of a field are
+        // removed, its entry is deleted from the distribution; otherwise the new
+        // count (current_count - count_diff) is stored.
+        for (field_id, count_diff) in fields_ids_distribution_diff {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            if let Entry::Occupied(mut entry) = fields_distribution.entry(field_name.to_string()) {
+                match entry.get().checked_sub(count_diff) {
+                    Some(0) | None => entry.remove(),
+                    Some(count) => entry.insert(count)
+                };
+            }
+        }
+
+        self.index.put_fields_distribution(self.wtxn, &fields_distribution)?;
+
         // We create the FST map of the external ids that we must delete.
         external_ids.sort_unstable();
         let external_ids_to_delete = fst::Set::from_iter(external_ids.iter().map(AsRef::as_ref))?;
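
For illustration (not part of the diff): the decrement logic above, extracted into a standalone, runnable sketch (the function name and signature are hypothetical).

    use std::collections::HashMap;
    use std::collections::hash_map::Entry;

    // Subtract per-field deletion counts from the distribution; entries that
    // reach zero (or would underflow) are removed entirely.
    fn subtract_diff(distribution: &mut HashMap<String, u64>, diff: HashMap<String, u64>) {
        for (field_name, count_diff) in diff {
            if let Entry::Occupied(mut entry) = distribution.entry(field_name) {
                match entry.get().checked_sub(count_diff) {
                    Some(0) | None => { entry.remove(); }
                    Some(count) => { entry.insert(count); }
                }
            }
        }
    }
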
@@ -347,5 +375,9 @@ mod tests {
         builder.execute().unwrap();
         wtxn.commit().unwrap();
+
+        let rtxn = index.read_txn().unwrap();
+
+        assert!(index.fields_distribution(&rtxn).unwrap().is_empty());
     }
 }

milli/src/update/index_documents/transform.rs

@@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@@ -10,11 +11,10 @@ use log::info;
 use roaring::RoaringBitmap;
 use serde_json::{Map, Value};
 
-use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
+use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use super::merge_function::merge_two_obkvs;
 use super::{create_writer, create_sorter, IndexDocumentsMethod};
-use crate::index::FieldsDistribution;
 
 const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
@@ -137,6 +137,8 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;
+        let mut fields_ids_distribution = HashMap::new();
+
         for result in documents {
             let document = result?;
@@ -151,9 +153,9 @@ impl Transform<'_, '_> {
             // We prepare the fields ids map with the documents keys.
             for (key, _value) in &document {
-                fields_ids_map.insert(&key).context("field id limit reached")?;
-                *fields_distribution.entry(key.to_owned()).or_default() += 1;
+                let field_id = fields_ids_map.insert(&key).context("field id limit reached")?;
+                *fields_ids_distribution.entry(field_id).or_insert(0) += 1;
             }
 
             // We retrieve the user id from the document based on the primary key name,
@@ -196,6 +198,11 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }
 
+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });
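
For illustration (not part of the diff): the two-phase accumulation used above, sketched standalone — count per FieldId while streaming documents, then resolve field names once at the end. The function name is hypothetical, and the `name_of` closure stands in for `fields_ids_map.name()`.

    use std::collections::HashMap;

    type FieldId = u8;

    fn merge_into_distribution(
        fields_ids_distribution: HashMap<FieldId, u64>,
        name_of: impl Fn(FieldId) -> String,
        fields_distribution: &mut HashMap<String, u64>,
    ) {
        // Resolving names once per distinct field, instead of once per document
        // field, keeps the per-document loop cheap.
        for (field_id, count) in fields_ids_distribution {
            *fields_distribution.entry(name_of(field_id)).or_default() += count;
        }
    }
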
@@ -277,6 +284,8 @@ impl Transform<'_, '_> {
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
         let mut documents_count = 0;
+        let mut fields_ids_distribution = HashMap::new();
+
         let mut record = csv::StringRecord::new();
         while csv.read_record(&mut record)? {
             obkv_buffer.clear();
@@ -316,9 +325,7 @@ impl Transform<'_, '_> {
                 serde_json::to_writer(&mut json_buffer, &field)?;
                 writer.insert(*field_id, &json_buffer)?;
 
-                let field_name = fields_ids_map.name(*field_id).unwrap();
-                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+                *fields_ids_distribution.entry(*field_id).or_insert(0) += 1;
             }
 
             // We use the extracted/generated user id as the key for this document.
@@ -326,6 +333,11 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }
 
+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
         progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
             documents_seen: documents_count,
         });