Merge pull request #256 from meilisearch/fix-tokenizer

Fix the tokenizer to make it work with unicode chars
2025-05-25 09:03:59 +02:00 · 2019-11-04 17:15:17 +01:00 · 2019-11-04 17:15:17 +01:00 · 89f30ad47e
commit 89f30ad47e
parent 1d4e98410a 3b1cbed238
3 changed files with 65 additions and 72 deletions
--- a/meilidb-core/src/raw_indexer.rs
+++ b/meilidb-core/src/raw_indexer.rs
@ -37,54 +37,8 @@ impl RawIndexer {
    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) -> usize {
        let mut number_of_words = 0;
        let lowercase_text = text.to_lowercase();
        let deunicoded = deunicode_with_tofu(&lowercase_text, "");
-        // TODO compute the deunicoded version after the cjk check
+        for token in Tokenizer::new(text) {
        let next = if !lowercase_text.contains(is_cjk) && lowercase_text != deunicoded {
            Some(deunicoded)
        } else {
            None
        };
        let iter = Some(lowercase_text).into_iter().chain(next);
        for text in iter {
            // we must not count 2 times the same words
            number_of_words = 0;
            for token in Tokenizer::new(&text) {
                let must_continue = index_token(
                    token,
                    id,
                    attr,
                    self.word_limit,
                    &self.stop_words,
                    &mut self.words_doc_indexes,
                    &mut self.docs_words,
                );
                if !must_continue {
                    break;
                }
                number_of_words += 1;
            }
        }
        number_of_words
    }
    pub fn index_text_seq<'a, I, IT>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where
        I: IntoIterator<Item = &'a str, IntoIter = IT>,
        IT: Iterator<Item = &'a str> + Clone,
    {
        // TODO serialize this to one call to the SeqTokenizer loop
        let lowercased: Vec<_> = iter.into_iter().map(str::to_lowercase).collect();
        let iter = lowercased.iter().map(|t| t.as_str());
        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
                id,
@ -95,27 +49,21 @@ impl RawIndexer {
                &mut self.docs_words,
            );
            number_of_words += 1;
            if !must_continue {
                break;
            }
        }
-        let deunicoded: Vec<_> = lowercased
+        number_of_words
-            .into_iter()
+    }
            .map(|lowercase_text| {
                if lowercase_text.contains(is_cjk) {
                    return lowercase_text;
                }
                let deunicoded = deunicode_with_tofu(&lowercase_text, "");
                if lowercase_text != deunicoded {
                    deunicoded
                } else {
                    lowercase_text
                }
            })
            .collect();
        let iter = deunicoded.iter().map(|t| t.as_str());
    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
    where
        I: IntoIterator<Item = &'a str>,
    {
        let iter = iter.into_iter();
        for token in SeqTokenizer::new(iter) {
            let must_continue = index_token(
                token,
@ -170,6 +118,12 @@ fn index_token(
        return false;
    }
    let lower = token.word.to_lowercase();
    let token = Token {
        word: &lower,
        ..token
    };
    if !stop_words.contains(&token.word) {
        match token_to_docindex(id, attr, token) {
            Some(docindex) => {
@ -182,6 +136,28 @@ fn index_token(
            }
            None => return false,
        }
        if !lower.contains(is_cjk) {
            let unidecoded = deunicode_with_tofu(&lower, "");
            if unidecoded != lower && !unidecoded.is_empty() {
                let token = Token {
                    word: &unidecoded,
                    ..token
                };
                match token_to_docindex(id, attr, token) {
                    Some(docindex) => {
                        let word = Vec::from(token.word);
                        words_doc_indexes
                            .entry(word.clone())
                            .or_insert_with(Vec::new)
                            .push(docindex);
                        docs_words.entry(id).or_insert_with(Vec::new).push(word);
                    }
                    None => return false,
                }
            }
        }
    }
    true
@ -224,10 +200,8 @@ mod tests {
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        // with the ugly apostrophe...
        assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }
@ -248,10 +222,8 @@ mod tests {
        assert!(words_doc_indexes.get(&b"aspirateur"[..]).is_some());
        assert!(words_doc_indexes.get(&b"ai"[..]).is_some());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        // with the ugly apostrophe...
        assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }
@ -277,10 +249,26 @@ mod tests {
        assert!(words_doc_indexes.get(&b"ai"[..]).is_none());
        assert!(words_doc_indexes.get(&b"de"[..]).is_none());
        assert!(words_doc_indexes.get(&b"eteindre"[..]).is_some());
        // with the ugly apostrophe...
        assert!(words_doc_indexes
-            .get(&"l’éteindre".to_owned().into_bytes())
+            .get(&"éteindre".to_owned().into_bytes())
            .is_some());
    }
    // Regression test: a text made only of the 🇯🇵 flag emoji must still be
    // indexed under its original (non-transliterated) bytes.
    //
    // NOTE(review): the name suggests that `deunicode_with_tofu` yields an empty
    // string for this input and that the empty variant must not be indexed —
    // confirm against the deunicode crate. This test itself only asserts that
    // the raw word is present; it does not check that no empty key was inserted.
    #[test]
    fn no_empty_unidecode() {
        // Built from an empty `fst::Set` (presumably the stop-word set, i.e. no
        // stop words — verify against `RawIndexer::new`).
        let mut indexer = RawIndexer::new(fst::Set::default());

        let docid = DocumentId(0);
        let attr = SchemaAttr(0);

        // The whole text is a single flag emoji.
        let text = "🇯🇵";
        indexer.index_text(docid, attr, text);

        // Only the word -> doc-index map is inspected; the other fields of
        // `Indexed` are ignored.
        let Indexed {
            words_doc_indexes, ..
        } = indexer.build();

        // The emoji must be retrievable by its own UTF-8 bytes.
        assert!(words_doc_indexes
            .get(&"🇯🇵".to_owned().into_bytes())
            .is_some());
    }
 }
--- a/meilidb-tokenizer/Cargo.toml
+++ b/meilidb-tokenizer/Cargo.toml
@ -5,4 +5,5 @@ authors = ["Kerollmops <renault.cle@gmail.com>"]
 edition = "2018"
 [dependencies]
 deunicode = "1.0.0"
 slice-group-by = "0.2.4"
--- a/meilidb-tokenizer/src/lib.rs
+++ b/meilidb-tokenizer/src/lib.rs
@ -1,4 +1,5 @@
 use self::SeparatorCategory::*;
 use deunicode::deunicode_char;
 use slice_group_by::StrGroupBy;
 use std::iter::Peekable;
@ -43,7 +44,10 @@ fn is_separator(c: char) -> bool {
 fn classify_separator(c: char) -> Option<SeparatorCategory> {
    match c {
-        ' ' | '-' | '_' | '\'' | ':' | '"' => Some(Soft),
+        c if c.is_whitespace() => Some(Soft), // whitespaces
        c if deunicode_char(c) == Some("'") => Some(Soft), // quotes
        c if deunicode_char(c) == Some("\"") => Some(Soft), // double quotes
        '-' | '_' | '\'' | ':' => Some(Soft),
        '.' | ';' | ',' | '!' | '?' | '(' | ')' => Some(Hard),
        _ => None,
    }