feat(index): update fields distribution in clear & delete operations

fixes after review bump the version of the tokenizer implement a first version of the stop_words The front must provide a BTreeSet containing the stop words The stop_words are set at None if an empty Set is provided add the stop-words in the http-ui interface Use maplit in the test and remove all the useless drop(rtxn) at the end of all tests Integrate the stop_words in the querytree remove the stop_words from the querytree except if it was a prefix or a typo more fixes after review
2025-07-04 20:37:15 +02:00 · 2021-04-01 10:07:16 +03:00 · 2021-04-01 10:07:16 +03:00 · 2658c5c545
commit 2658c5c545
parent 27c7ab6e00
7 changed files with 128 additions and 34 deletions
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -10,7 +10,7 @@ use chrono::{Utc, DateTime};

 use crate::facet::FacetType;
 use crate::fields_ids_map::FieldsIdsMap;
-use crate::{default_criteria, Criterion, Search, FacetDistribution};
+use crate::{default_criteria, Criterion, Search, FacetDistribution, FieldsDistribution};
 use crate::{BEU32, DocumentId, FieldId, ExternalDocumentsIds};
 use crate::{
    RoaringBitmapCodec, RoaringBitmapLenCodec, BEU32StrCodec,
@ -34,8 +34,6 @@ pub const WORDS_PREFIXES_FST_KEY: &str = "words-prefixes-fst";
 const CREATED_AT_KEY: &str = "created-at";
 const UPDATED_AT_KEY: &str = "updated-at";

-pub type FieldsDistribution = HashMap<String, u64>;
-
 #[derive(Clone)]
 pub struct Index {
    /// The LMDB environment which this index is associated with.
@ -209,14 +207,14 @@ impl Index {

    /* fields distribution */

-    /// Writes the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Writes the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
    pub fn put_fields_distribution(&self, wtxn: &mut RwTxn, distribution: &FieldsDistribution) -> heed::Result<()> {
-        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, &distribution)
+        self.main.put::<_, Str, SerdeJson<FieldsDistribution>>(wtxn, FIELDS_DISTRIBUTION_KEY, distribution)
    }

-    /// Returns the fields distribution which associate the field with the number of times
-    /// it occurs in the obkv documents.
+    /// Returns the fields distribution which associates every field name with
+    /// the number of times it occurs in the documents.
    pub fn fields_distribution(&self, rtxn: &RoTxn) -> heed::Result<FieldsDistribution> {
        Ok(self.main.get::<_, Str, SerdeJson<FieldsDistribution>>(rtxn, FIELDS_DISTRIBUTION_KEY)?.unwrap_or_default())
    }
@ -472,35 +470,29 @@ mod tests {
    use crate::Index;
    use crate::update::{IndexDocuments, UpdateFormat};

-    fn prepare_index() -> Index {
+    #[test]
+    fn initial_fields_distribution() {
        let path = tempfile::tempdir().unwrap();
        let mut options = EnvOpenOptions::new();
        options.map_size(10 * 1024 * 1024); // 10 MB
        let index = Index::new(options, &path).unwrap();

        let mut wtxn = index.write_txn().unwrap();
-        let content = &br#"
-        { "name": "kevin" }
-        { "name": "bob", "age": 20 }
-        "#[..];
+        let content = &br#"[
+            { "name": "kevin" },
+            { "name": "bob", "age": 20 }
+        ]"#[..];
        let mut builder = IndexDocuments::new(&mut wtxn, &index, 0);
-        builder.update_format(UpdateFormat::JsonStream);
+        builder.update_format(UpdateFormat::Json);
        builder.execute(content, |_, _| ()).unwrap();
        wtxn.commit().unwrap();

-        index
-    }
-
-    #[test]
-    fn initial_fields_distribution() {
-        let index = prepare_index();
-
        let rtxn = index.read_txn().unwrap();

        let fields_distribution = index.fields_distribution(&rtxn).unwrap();
        assert_eq!(fields_distribution, hashmap!{
+            "name".to_string() => 2,
            "age".to_string() => 1,
-            "name".to_string() => 2
        });
    }
 }