3866: Update charabia v0.8.0 r=dureuill a=ManyTheFish

# Pull Request

Update Charabia:
- enhance Japanese segmentation
- enhance Latin tokenization
  - words containing `_` are now properly segmented into several words
  - brackets `{([])}` are no longer considered context separators, so words separated by brackets are now considered close together by the proximity ranking rule (see the segmentation sketch after the note below)
- fixes #3815
- fixes #3778
- fixes [product#151](https://github.com/meilisearch/product/discussions/151)

> Important note: floating-point numbers are now segmented around the `.`, so `3.22` is segmented as [`3`, `.`, `22`]; however, the dot isn't considered a hard separator, which means that a search for `3.22` still finds documents containing `3.22`.
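For illustration, here is a minimal sketch of how these rules play out with charabia's `Segment` trait. The exact token lists in the comments are assumptions based on the description above, not output taken from this PR:

```rust
use charabia::Segment;

fn main() {
    // `_` now splits a word into several segments:
    let words: Vec<&str> = "snake_case".segment_str().collect();
    // assumed output: ["snake", "_", "case"]

    // brackets are no longer context separators, so bracketed words stay
    // close together for the proximity ranking rule:
    let brackets: Vec<&str> = "quick (brown) fox".segment_str().collect();
    // assumed output: ["quick", " ", "(", "brown", ")", " ", "fox"]

    // floats are segmented around `.`, but the dot stays a soft separator,
    // so searching `3.22` still matches documents containing `3.22`:
    let float: Vec<&str> = "3.22".segment_str().collect();
    // assumed output: ["3", ".", "22"]

    println!("{words:?} {brackets:?} {float:?}");
}
```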

Co-authored-by: ManyTheFish <many@meilisearch.com>
Committed by meili-bors[bot] via GitHub on 2023-06-29 15:24:36 +00:00
commit 661d1f90dc
14 changed files with 252 additions and 187 deletions

@@ -1,7 +1,7 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
-use charabia::{Tokenizer, TokenizerBuilder};
+use charabia::{Normalize, Tokenizer, TokenizerBuilder};
 use deserr::{DeserializeError, Deserr};
 use itertools::Itertools;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
@@ -423,6 +423,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         match self.stop_words {
             Setting::Set(ref stop_words) => {
                 let current = self.index.stop_words(self.wtxn)?;
+
+                // Apply an unlossy normalization on stop_words
+                let stop_words = stop_words
+                    .iter()
+                    .map(|w| w.as_str().normalize(&Default::default()).into_owned());
+
                 // since we can't compare a BTreeSet with an FST we are going to convert the
                 // BTreeSet to an FST and then compare bytes per bytes the two FSTs.
                 let fst = fst::Set::from_iter(stop_words)?;
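As an aside on the hunk above: the existing comment explains that a `BTreeSet` can't be compared with an FST directly, so both sides are materialized as FSTs and compared byte per byte. A hedged sketch of that comparison using the `fst` crate (illustrative values, not code from this PR):

```rust
use fst::Set;

fn main() -> Result<(), fst::Error> {
    // `from_iter` requires lexicographically sorted input; the inputs must
    // also be normalized the same way, otherwise the byte comparison below
    // would report a spurious difference.
    let current = Set::from_iter(vec!["brown", "fox"])?;
    let incoming = Set::from_iter(vec!["brown", "fox"])?;

    // Compare the two FSTs byte per byte.
    let changed = current.as_fst().as_bytes() != incoming.as_fst().as_bytes();
    assert!(!changed);
    Ok(())
}
```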
@@ -446,7 +452,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> Vec<String> {
                     tokenizer
                         .tokenize(text)
                         .filter_map(|token| {
@@ -647,7 +653,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                fn normalize(tokenizer: &Tokenizer, text: &str) -> String {
                     tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
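The two `normalize` hunks reflect a signature change in charabia v0.8.0: `Tokenizer` lost its type parameter, so `Tokenizer<&[u8]>` becomes plain `Tokenizer`. A minimal sketch of obtaining one, assuming the v0.8.0 builder API:

```rust
use charabia::TokenizerBuilder;

fn main() {
    // `build()` now yields a non-generic `Tokenizer`.
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("The quick (\"brown\") fox") {
        println!("{:?}", token.lemma());
    }
}
```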