feat(index): update fields distribution in clear & delete operations

Fixes after review. Bump the version of the tokenizer. Implement a first version of the stop_words: the front must provide a BTreeSet containing the stop words, and the stop_words are set to None if an empty set is provided. Add the stop-words to the http-ui interface. Use maplit in the tests and remove all the useless drop(rtxn) at the end of all tests. Integrate the stop_words in the querytree. Remove the stop_words from the querytree except if it was a prefix or a typo. More fixes after review.
2025-07-04 20:37:15 +02:00 · 2021-04-01 10:07:16 +03:00 · 2021-04-01 10:07:16 +03:00 · 2658c5c545
commit 2658c5c545
parent 27c7ab6e00
7 changed files with 128 additions and 34 deletions
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -1,4 +1,5 @@
 use std::borrow::Cow;
+use std::collections::HashMap;
 use std::fs::File;
 use std::io::{Read, Seek, SeekFrom};
 use std::iter::Peekable;
@ -10,11 +11,10 @@ use log::info;
 use roaring::RoaringBitmap;
 use serde_json::{Map, Value};

-use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId};
+use crate::{Index, BEU32, MergeFn, FieldsIdsMap, ExternalDocumentsIds, FieldId, FieldsDistribution};
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use super::merge_function::merge_two_obkvs;
 use super::{create_writer, create_sorter, IndexDocumentsMethod};
-use crate::index::FieldsDistribution;

 const DEFAULT_PRIMARY_KEY_NAME: &str = "id";

@ -137,6 +137,8 @@ impl Transform<'_, '_> {
        let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
        let mut documents_count = 0;

+        let mut fields_ids_distribution = HashMap::new();
+
        for result in documents {
            let document = result?;

@ -151,9 +153,9 @@ impl Transform<'_, '_> {

            // We prepare the fields ids map with the documents keys.
            for (key, _value) in &document {
-                fields_ids_map.insert(&key).context("field id limit reached")?;
+                let field_id = fields_ids_map.insert(&key).context("field id limit reached")?;

-                *fields_distribution.entry(key.to_owned()).or_default() += 1;
+                *fields_ids_distribution.entry(field_id).or_insert(0) += 1;
            }

            // We retrieve the user id from the document based on the primary key name,
@ -196,6 +198,11 @@ impl Transform<'_, '_> {
            documents_count += 1;
        }

+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
        progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
            documents_seen: documents_count,
        });
@ -277,6 +284,8 @@ impl Transform<'_, '_> {
        let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
        let mut documents_count = 0;

+        let mut fields_ids_distribution = HashMap::new();
+
        let mut record = csv::StringRecord::new();
        while csv.read_record(&mut record)? {
            obkv_buffer.clear();
@ -316,9 +325,7 @@ impl Transform<'_, '_> {
                serde_json::to_writer(&mut json_buffer, &field)?;
                writer.insert(*field_id, &json_buffer)?;

-                let field_name = fields_ids_map.name(*field_id).unwrap();
-
-                *fields_distribution.entry(field_name.to_string()).or_default() += 1;
+                *fields_ids_distribution.entry(*field_id).or_insert(0) += 1;
            }

            // We use the extracted/generated user id as the key for this document.
@ -326,6 +333,11 @@ impl Transform<'_, '_> {
            documents_count += 1;
        }

+        for (field_id, count) in fields_ids_distribution {
+            let field_name = fields_ids_map.name(field_id).unwrap();
+            *fields_distribution.entry(field_name.to_string()).or_default() += count;
+        }
+
        progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
            documents_seen: documents_count,
        });