3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish

# Pull Request

Update Charabia:
- enhance Japanese segmentation
- enhance Latin tokenization
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now treated as close together by the proximity ranking rule
- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)

> Important note: floating-point numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]. However, the dot isn't considered a hard separator, which means that searching for `3.22` still finds documents containing `3.22`. The sketch below illustrates this behavior.
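A minimal sketch of the new segmentation, using charabia's `Tokenize` trait on plain strings; the inputs and the expected outputs in the comments are illustrative assumptions, not taken from this PR's test suite:

```rust
use charabia::Tokenize;

/// Collect the lemma of every token, separators included.
fn lemmas(text: &str) -> Vec<String> {
    text.tokenize().map(|token| token.lemma().to_string()).collect()
}

fn main() {
    // `_` now splits words: expected ["snake", "_", "case"].
    println!("{:?}", lemmas("snake_case"));

    // Floats are split around the dot, which stays a soft separator, so the
    // parts remain adjacent for the proximity ranking rule: expected ["3", ".", "22"].
    println!("{:?}", lemmas("3.22"));

    // Brackets are soft separators too, so `foo` and `bar` stay close
    // together for proximity: expected ["foo", "(", "bar", ")"].
    println!("{:?}", lemmas("foo(bar)"));
}
```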

Co-authored-by: ManyTheFish <many@meilisearch.com>
commit 661d1f90dc
meili-bors[bot] 2023-06-29 15:24:36 +00:00, committed by GitHub
14 changed files with 252 additions and 187 deletions

@@ -128,10 +128,10 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
         .map(|reader| (documents_ids, reader, script_language_docids))
 }
 
-fn extract_tokens_from_document<T: AsRef<[u8]>>(
+fn extract_tokens_from_document(
     obkv: &KvReader<FieldId>,
     searchable_fields: &Option<HashSet<FieldId>>,
-    tokenizer: &Tokenizer<T>,
+    tokenizer: &Tokenizer,
     max_positions_per_attributes: u32,
     buffers: &mut Buffers,
     script_language_word_count: &mut HashMap<Script, Vec<(Language, usize)>>,
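Charabia v0.8.0 makes `Tokenizer` a plain, non-generic type (the stop-words container is no longer a type parameter), which is why the `T: AsRef<[u8]>` bound disappears from `extract_tokens_from_document`. A minimal sketch of the new type in use; building the tokenizer with `TokenizerBuilder::default()` is an assumption based on charabia's documented API, not code from this diff:

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Under v0.8.0, a tokenizer can be passed around without type parameters.
fn count_tokens(tokenizer: &Tokenizer, text: &str) -> usize {
    tokenizer.tokenize(text).count()
}

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();
    // "snake_case" should now yield ["snake", "_", "case"]: 3 tokens.
    println!("{}", count_tokens(&tokenizer, "snake_case"));
}
```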


@@ -1,18 +1,21 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, ]
 1 [2, ]
-10.0 [1, ]
+10 [1, ]
 12 [0, ]
 1344 [3, ]
 2 [0, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, ]
+25 [2, ]
+3 [0, ]
 35 [5, ]
 4 [4, ]
 42 [0, 5, ]
 456 [1, ]
+5 [0, ]
+99 [2, ]
 adams [5, ]
 adventure [1, ]
 alice [2, ]
@@ -29,7 +32,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, ]
 harry [4, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, ]


@@ -1,19 +1,22 @@
 ---
 source: milli/src/update/index_documents/mod.rs
 ---
+0 [1, 7, ]
 1 [2, ]
-10.0 [1, 7, ]
+10 [1, 7, ]
 12 [0, 8, ]
 1344 [3, ]
 1813 [8, ]
 2 [0, 8, ]
 23 [5, ]
-25.99 [2, ]
-3.5 [0, 8, ]
+25 [2, ]
+3 [0, 8, ]
 35 [5, ]
 4 [4, 6, ]
 42 [0, 5, 8, ]
 456 [1, 7, ]
+5 [0, 8, ]
+99 [2, ]
 adams [5, ]
 adventure [1, 7, ]
 alice [2, ]
@@ -31,7 +34,7 @@ galaxy [5, ]
 guide [5, ]
 half [4, 6, ]
 harry [4, 6, ]
-hitchhiker' [5, ]
+hitchhiker [5, ]
 hobbit [3, ]
 in [2, ]
 j [3, 4, 6, 8, ]


@@ -1,7 +1,7 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
-use charabia::{Tokenizer, TokenizerBuilder};
+use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::Itertools;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         match self.stop_words {
             Setting::Set(ref stop_words) => {
                 let current = self.index.stop_words(self.wtxn)?;
+
+                // Apply an unlossy normalization on stop_words
+                let stop_words = stop_words
+                    .iter()
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+
                 // since we can't compare a BTreeSet with an FST we are going to convert the
                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
                 let fst = fst::Set::from_iter(stop_words)?;
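The rationale for the new normalization step: stop words are matched against tokens that were already normalized at indexing time, so they must undergo the same unlossy normalization before being stored in the FST, otherwise a stop word like `Été` would never match its indexed form. A rough sketch of the call used above; judging by the `.into_owned()` in the diff, `normalize` on `&str` appears to return a `Cow<str>`, but that return type is an assumption:

```rust
use charabia::Normalize;

fn main() {
    // Mirrors the call in the diff: normalize a raw `&str` with default
    // options, then take ownership of the result.
    let normalized = "Été".normalize(&Default::default()).into_owned();
    // Illustrative expectation: lowercased and de-accented, i.e. "ete".
    println!("{normalized}");
}
```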
@@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
@@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
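For illustration, a self-contained round trip through a helper of the same shape as the updated `normalize` above; the tokenizer construction and the expected output are assumptions for the sketch, not code from this PR:

```rust
use charabia::{Tokenizer, TokenizerBuilder};

// Same shape as the exact-words helper in the diff: tokenize, then
// concatenate the lemma of every token, separators included.
fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
}

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();
    // Illustrative expectation: lemmas are lowercased, so the exact word
    // "Hitchhiker" would be stored as "hitchhiker".
    println!("{}", normalize(&tokenizer, "Hitchhiker"));
}
```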


@@ -1,27 +1,28 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_71 [21, ]
-2.2 [21, ]
+1 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, ]
+2 [21, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+71 [21, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]


@@ -1,4 +1,25 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 71 [21, ]
+1 2 2 [21, ]


@@ -1,31 +1,31 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
-1.2 [20, 22, ]
-1_36 [3, ]
-1_37 [4, ]
-1_38 [5, ]
-1_39 [6, ]
-1_4 [0, ]
-1_40 [7, ]
-1_41 [8, ]
-1_42 [9, ]
-1_43 [10, ]
-1_44 [11, ]
-1_45 [12, ]
-1_46 [13, ]
-1_47 [14, ]
-1_5 [1, ]
-1_52 [15, ]
-1_57 [16, ]
-1_58 [17, ]
-1_68 [18, ]
-1_69 [19, ]
-1_7 [2, ]
-1_70 [20, ]
-1_71 [21, ]
-1_72 [22, ]
-2.2 [21, ]
+1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, ]
+2 [20, 21, 22, ]
+36 [3, ]
+37 [4, ]
+38 [5, ]
+39 [6, ]
+4 [0, ]
+40 [7, ]
+41 [8, ]
+42 [9, ]
+43 [10, ]
+44 [11, ]
+45 [12, ]
+46 [13, ]
+47 [14, ]
+5 [1, ]
+52 [15, ]
+57 [16, ]
+58 [17, ]
+68 [18, ]
+69 [19, ]
+7 [2, ]
+70 [20, ]
+71 [21, ]
+72 [22, ]
 abstract [2, 6, 10, 13, 14, 15, 16, 17, ]
 aquarium [5, ]
 art [4, 5, 8, 9, 10, 12, 17, ]


@@ -1,4 +1,29 @@
 ---
 source: milli/src/update/delete_documents.rs
 ---
+1 1 2 [20, 22, ]
+1 1 36 [3, ]
+1 1 37 [4, ]
+1 1 38 [5, ]
+1 1 39 [6, ]
+1 1 4 [0, ]
+1 1 40 [7, ]
+1 1 41 [8, ]
+1 1 42 [9, ]
+1 1 43 [10, ]
+1 1 44 [11, ]
+1 1 45 [12, ]
+1 1 46 [13, ]
+1 1 47 [14, ]
+1 1 5 [1, ]
+1 1 52 [15, ]
+1 1 57 [16, ]
+1 1 58 [17, ]
+1 1 68 [18, ]
+1 1 69 [19, ]
+1 1 7 [2, ]
+1 1 70 [20, ]
+1 1 71 [21, ]
+1 1 72 [22, ]
+1 2 2 [21, ]