integration with new tokenizer wip

2025-07-01 02:48:31 +02:00 · 2020-11-19 18:23:08 +01:00 · 2020-11-19 18:23:08 +01:00 · 5e00842087
commit 5e00842087
parent 8a4d05b7bb
5 changed files with 94 additions and 60 deletions
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@ -2,13 +2,3 @@ mod dfa;


 pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
-
-pub fn normalize_str(string: &str) -> String {
-    let mut string = string.to_lowercase();
-
-    if !string.contains(is_cjk) {
-        string = deunicode::deunicode_with_tofu(&string, "");
-    }
-
-    string
-}
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@ -7,13 +7,14 @@ use std::{cmp, fmt, iter::once};

 use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
-use meilisearch_tokenizer::split_query_string;
-use sdset::{Set, SetBuf, SetOperation};
 use log::debug;
+use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use sdset::{Set, SetBuf, SetOperation};

 use crate::database::MainT;
 use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
-use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
+use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
 use crate::QueryWordsMapper;

 #[derive(Clone, PartialEq, Eq, Hash)]
@ -146,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
 }

 fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
-    let words = normalize_str(&words.join(" "));
+    let words = &words.join(" ");
    let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;

    let mut strings = Vec::new();
@ -174,15 +175,25 @@ where I: IntoIterator<Item=Operation>,

 const MAX_NGRAM: usize = 3;

+fn split_query_string(s: &str) -> Vec<(usize, String)> {
+    // TODO: reuse a shared Analyzer instead of building a default one on every call
+    let analyzer = Analyzer::new(AnalyzerConfig::default());
+    analyzer
+        .analyze(s)
+        .tokens()
+        .filter(|t| !t.is_stopword()) // drop stop words first...
+        .enumerate() // ...so the index counts only the tokens that were kept
+        .map(|(i, Token { word, .. })| (i, word.to_string()))
+        .collect()
+}
+
 pub fn create_query_tree(
    reader: &heed::RoTxn<MainT>,
    ctx: &Context,
    query: &str,
 ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)>
 {
-    let words = split_query_string(query).map(str::to_lowercase);
-    let words = words.filter(|w| !ctx.stop_words.contains(w));
-    let words: Vec<_> = words.enumerate().collect();
+    let words = split_query_string(query);

    let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));

--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@ -2,8 +2,9 @@ use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;

-use deunicode::deunicode_with_tofu;
 use meilisearch_schema::IndexedPos;
+use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::Token;
 use sdset::SetBuf;

 use crate::{DocIndex, DocumentId};
@ -18,6 +19,7 @@ pub struct RawIndexer<A> {
    stop_words: fst::Set<A>,
    words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
    docs_words: HashMap<DocumentId, Vec<Word>>,
+    analyzer: Analyzer,
 }

 pub struct Indexed<'a> {
@ -36,6 +38,7 @@ impl<A> RawIndexer<A> {
            stop_words,
            words_doc_indexes: BTreeMap::new(),
            docs_words: HashMap::new(),
+            analyzer: Analyzer::new(AnalyzerConfig::default()),
        }
    }
 }
@ -44,9 +47,12 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
    pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
        let mut number_of_words = 0;

-        for token in Tokenizer::new(text) {
+        let analyzed_text = self.analyzer.analyze(text);
+        for (word_pos, (token_index, token)) in  analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
            let must_continue = index_token(
                token,
+                token_index,
+                word_pos,
                id,
                indexed_pos,
                self.word_limit,
@ -69,20 +75,47 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
    where
        I: IntoIterator<Item = &'s str>,
    {
-        let iter = iter.into_iter();
-        for token in SeqTokenizer::new(iter) {
-            let must_continue = index_token(
-                token,
-                id,
-                indexed_pos,
-                self.word_limit,
-                &self.stop_words,
-                &mut self.words_doc_indexes,
-                &mut self.docs_words,
-            );
+        let mut token_index_offset = 0;
+        let mut byte_offset = 0;
+        let mut word_offset = 0;

-            if !must_continue {
-                break;
+        for s in iter.into_iter() {
+            let current_token_index_offset = token_index_offset;
+            let current_byte_offset = byte_offset;
+            let current_word_offset = word_offset;
+
+            let analyzed_text = self.analyzer.analyze(s);
+            let tokens = analyzed_text
+                .tokens()
+                .enumerate()
+                .map(|(i, mut t)| {
+                    t.byte_start = t.byte_start + current_byte_offset;
+                    t.byte_end = t.byte_end + current_byte_offset;
+                    (i + current_token_index_offset, t)
+                })
+                .enumerate()
+                .map(|(i, t)| (i + current_word_offset, t));
+
+            for (word_pos, (token_index, token)) in tokens  {
+                token_index_offset = token_index + 1;
+                word_offset = word_pos + 1;
+                byte_offset = token.byte_end + 1;
+
+                let must_continue = index_token(
+                    token,
+                    token_index,
+                    word_pos,
+                    id,
+                    indexed_pos,
+                    self.word_limit,
+                    &self.stop_words,
+                    &mut self.words_doc_indexes,
+                    &mut self.docs_words,
+                );
+
+                if !must_continue {
+                    break;
+                }
            }
        }
    }
@ -114,6 +147,8 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {

 fn index_token<A>(
    token: Token,
+    position: usize,
+    word_pos: usize,
    id: DocumentId,
    indexed_pos: IndexedPos,
    word_limit: usize,
@ -123,20 +158,14 @@ fn index_token<A>(
 ) -> bool
 where A: AsRef<[u8]>,
 {
-    if token.index >= word_limit {
+    if position >= word_limit {
        return false;
    }

-    let lower = token.word.to_lowercase();
-    let token = Token {
-        word: &lower,
-        ..token
-    };
-
-    if !stop_words.contains(&token.word) {
-        match token_to_docindex(id, indexed_pos, token) {
+    if !stop_words.contains(&token.word.as_ref()) {
+        match token_to_docindex(id, indexed_pos, &token, word_pos) {
            Some(docindex) => {
-                let word = Vec::from(token.word);
+                let word = Vec::from(token.word.as_ref());

                if word.len() <= WORD_LENGTH_LIMIT {
                    words_doc_indexes
@ -144,20 +173,6 @@ where A: AsRef<[u8]>,
                        .or_insert_with(Vec::new)
                        .push(docindex);
                    docs_words.entry(id).or_insert_with(Vec::new).push(word);
-
-                    if !lower.contains(is_cjk) {
-                        let unidecoded = deunicode_with_tofu(&lower, "");
-                        if unidecoded != lower && !unidecoded.is_empty() {
-                            let word = Vec::from(unidecoded);
-                            if word.len() <= WORD_LENGTH_LIMIT {
-                                words_doc_indexes
-                                    .entry(word.clone())
-                                    .or_insert_with(Vec::new)
-                                    .push(docindex);
-                                docs_words.entry(id).or_insert_with(Vec::new).push(word);
-                            }
-                        }
-                    }
                }
            }
            None => return false,
@ -167,8 +182,8 @@ where A: AsRef<[u8]>,
    true
 }

-fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
-    let word_index = u16::try_from(token.word_index).ok()?;
+fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
+    let word_index = u16::try_from(word_index).ok()?;
    let char_index = u16::try_from(token.char_index).ok()?;
    let char_length = u16::try_from(token.word.chars().count()).ok()?;

--- a/meilisearch-http/src/helpers/meilisearch.rs
+++ b/meilisearch-http/src/helpers/meilisearch.rs
@ -11,7 +11,6 @@ use meilisearch_core::criterion::*;
 use meilisearch_core::settings::RankingRule;
 use meilisearch_core::{Highlight, Index, RankedMap};
 use meilisearch_schema::{FieldId, Schema};
-use meilisearch_tokenizer::is_cjk;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use siphasher::sip::SipHasher;
@ -344,7 +343,7 @@ pub struct SearchResult {

 /// returns the start index and the length on the crop.
 fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
-    let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
+    let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);

    let word_end_index = |mut index| {
        if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
--- a/meilisearch-http/src/helpers/mod.rs
+++ b/meilisearch-http/src/helpers/mod.rs
@ -5,3 +5,22 @@ pub mod compression;

 pub use authentication::Authentication;
 pub use normalize_path::NormalizePath;
+
+pub fn is_cjk(c: char) -> bool { // true when `c` lies in any of the code point ranges listed below
+    (c >= '\u{1100}' && c <= '\u{11ff}')  // Hangul Jamo
+        || (c >= '\u{2e80}' && c <= '\u{2eff}')  // CJK Radicals Supplement
+        || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical
+        || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation
+        || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana
+        || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana
+        || (c >= '\u{3100}' && c <= '\u{312f}') // Bopomofo
+        || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo
+        || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months
+        || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A
+        || (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs
+        || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A
+        || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables
+        || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B
+        || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs
+        || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana
+}