diff --git a/meilisearch-core/src/automaton/mod.rs b/meilisearch-core/src/automaton/mod.rs
index c47645041..27b63f25c 100644
--- a/meilisearch-core/src/automaton/mod.rs
+++ b/meilisearch-core/src/automaton/mod.rs
@@ -2,13 +2,3 @@
 mod dfa;
 
 pub use self::dfa::{build_dfa, build_prefix_dfa, build_exact_dfa};
-
-pub fn normalize_str(string: &str) -> String {
-    let mut string = string.to_lowercase();
-
-    if !string.contains(is_cjk) {
-        string = deunicode::deunicode_with_tofu(&string, "");
-    }
-
-    string
-}
diff --git a/meilisearch-core/src/query_tree.rs b/meilisearch-core/src/query_tree.rs
index 4a3a622b2..4b4772036 100644
--- a/meilisearch-core/src/query_tree.rs
+++ b/meilisearch-core/src/query_tree.rs
@@ -7,13 +7,14 @@ use std::{cmp, fmt, iter::once};
 
 use fst::{IntoStreamer, Streamer};
 use itertools::{EitherOrBoth, merge_join_by};
-use meilisearch_tokenizer::split_query_string;
-use sdset::{Set, SetBuf, SetOperation};
 use log::debug;
+use meilisearch_tokenizer::Token;
+use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use sdset::{Set, SetBuf, SetOperation};
 
 use crate::database::MainT;
 use crate::{store, DocumentId, DocIndex, MResult, FstSetCow};
-use crate::automaton::{normalize_str, build_dfa, build_prefix_dfa, build_exact_dfa};
+use crate::automaton::{build_dfa, build_prefix_dfa, build_exact_dfa};
 use crate::QueryWordsMapper;
 
 #[derive(Clone, PartialEq, Eq, Hash)]
@@ -146,7 +147,7 @@ fn split_best_frequency<'a>(reader: &heed::RoTxn<MainT>, ctx: &Context, word: &'
 }
 
 fn fetch_synonyms(reader: &heed::RoTxn<MainT>, ctx: &Context, words: &[&str]) -> MResult<Vec<Vec<String>>> {
-    let words = normalize_str(&words.join(" "));
+    let words = &words.join(" ");
     let set = ctx.synonyms.synonyms_fst(reader, words.as_bytes())?;
 
     let mut strings = Vec::new();
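Context for the two removals above: `normalize_str` lowercased its input and, when the string contained no CJK characters, transliterated it to ASCII with `deunicode`. With tokenization moving to the analyzer (see the query_tree.rs and raw_indexer.rs hunks below), that normalization is expected to happen inside `meilisearch-tokenizer` itself, which is why `fetch_synonyms` can now key the synonyms FST on the raw joined words. A standalone sketch of the deleted behavior, assuming the `deunicode` crate as a dependency; `is_cjk_char` is a trimmed, illustrative stand-in for the old `meilisearch_tokenizer::is_cjk`:

```rust
// Sketch of what the deleted normalize_str did: lowercase everything,
// then transliterate to ASCII unless the string contains CJK characters.
fn is_cjk_char(c: char) -> bool {
    // stand-in: only the CJK Unified Ideographs block
    ('\u{4e00}'..='\u{9fff}').contains(&c)
}

fn normalize_str(string: &str) -> String {
    let mut string = string.to_lowercase();
    if !string.contains(is_cjk_char) {
        string = deunicode::deunicode_with_tofu(&string, "");
    }
    string
}

fn main() {
    assert_eq!(normalize_str("Été"), "ete"); // lowercased, then transliterated
    assert_eq!(normalize_str("漢字"), "漢字"); // CJK input is left intact
}
```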
@@ -174,15 +175,25 @@ where I: IntoIterator<Item = Operation>,
 
 const MAX_NGRAM: usize = 3;
 
+fn split_query_string(s: &str) -> Vec<(usize, String)> {
+    // TODO: Use global instance instead
+    let analyzer = Analyzer::new(AnalyzerConfig::default());
+    analyzer
+        .analyze(s)
+        .tokens()
+        .filter(|t| !t.is_stopword())
+        .enumerate()
+        .map(|(i, Token { word, .. })| (i, word.to_string()))
+        .collect()
+}
+
 pub fn create_query_tree(
     reader: &heed::RoTxn<MainT>,
     ctx: &Context,
     query: &str,
 ) -> MResult<(Operation, HashMap<QueryId, Range<usize>>)> {
-    let words = split_query_string(query).map(str::to_lowercase);
-    let words = words.filter(|w| !ctx.stop_words.contains(w));
-    let words: Vec<_> = words.enumerate().collect();
+    let words = split_query_string(query);
 
     let mut mapper = QueryWordsMapper::new(words.iter().map(|(_, w)| w));
diff --git a/meilisearch-core/src/raw_indexer.rs b/meilisearch-core/src/raw_indexer.rs
index 471d0cfff..e234ca736 100644
--- a/meilisearch-core/src/raw_indexer.rs
+++ b/meilisearch-core/src/raw_indexer.rs
@@ -2,8 +2,9 @@ use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
 use std::convert::TryFrom;
 
-use deunicode::deunicode_with_tofu;
 use meilisearch_schema::IndexedPos;
+use meilisearch_tokenizer::tokenizer::{Analyzer, AnalyzerConfig};
+use meilisearch_tokenizer::Token;
 use sdset::SetBuf;
 
 use crate::{DocIndex, DocumentId};
@@ -18,6 +19,7 @@ pub struct RawIndexer<A> {
     stop_words: fst::Set<A>,
     words_doc_indexes: BTreeMap<Word, Vec<DocIndex>>,
     docs_words: HashMap<DocumentId, Vec<Word>>,
+    analyzer: Analyzer,
 }
 
 pub struct Indexed<'a> {
@@ -36,6 +38,7 @@ impl<A> RawIndexer<A> {
             stop_words,
             words_doc_indexes: BTreeMap::new(),
             docs_words: HashMap::new(),
+            analyzer: Analyzer::new(AnalyzerConfig::default()),
         }
     }
 }
@@ -44,9 +47,12 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
     pub fn index_text(&mut self, id: DocumentId, indexed_pos: IndexedPos, text: &str) -> usize {
         let mut number_of_words = 0;
 
-        for token in Tokenizer::new(text) {
+        let analyzed_text = self.analyzer.analyze(text);
+        for (word_pos, (token_index, token)) in analyzed_text.tokens().enumerate().filter(|(_, t)| !t.is_separator()).enumerate() {
             let must_continue = index_token(
                 token,
+                token_index,
+                word_pos,
                 id,
                 indexed_pos,
                 self.word_limit,
@@ -69,20 +75,47 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
     where I: IntoIterator<Item = &'a str>,
     {
-        let iter = iter.into_iter();
-        for token in SeqTokenizer::new(iter) {
-            let must_continue = index_token(
-                token,
-                id,
-                indexed_pos,
-                self.word_limit,
-                &self.stop_words,
-                &mut self.words_doc_indexes,
-                &mut self.docs_words,
-            );
+        let mut token_index_offset = 0;
+        let mut byte_offset = 0;
+        let mut word_offset = 0;
 
-            if !must_continue {
-                break;
+        for s in iter.into_iter() {
+            let current_token_index_offset = token_index_offset;
+            let current_byte_offset = byte_offset;
+            let current_word_offset = word_offset;
+
+            let analyzed_text = self.analyzer.analyze(s);
+            let tokens = analyzed_text
+                .tokens()
+                .enumerate()
+                .map(|(i, mut t)| {
+                    t.byte_start = t.byte_start + current_byte_offset;
+                    t.byte_end = t.byte_end + current_byte_offset;
+                    (i + current_token_index_offset, t)
+                })
+                .enumerate()
+                .map(|(i, t)| (i + current_word_offset, t));
+
+            for (word_pos, (token_index, token)) in tokens {
+                token_index_offset = token_index + 1;
+                word_offset = word_pos + 1;
+                byte_offset = token.byte_end + 1;
+
+                let must_continue = index_token(
+                    token,
+                    token_index,
+                    word_pos,
+                    id,
+                    indexed_pos,
+                    self.word_limit,
+                    &self.stop_words,
+                    &mut self.words_doc_indexes,
+                    &mut self.docs_words,
+                );
+
+                if !must_continue {
+                    break;
+                }
             }
         }
     }
 
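The rewritten `index_text_seq` above threads three counters across the iterator of text fields so that token indexes, word positions, and byte ranges stay globally monotonic; bumping `byte_offset` to `token.byte_end + 1` keeps tokens from consecutive fields from ever sharing a byte range. A minimal sketch of that bookkeeping, under stated assumptions: `Tok` and the whitespace "tokenizer" are illustrative stand-ins, not the real `meilisearch-tokenizer` API.

```rust
// Stand-in token type; the real Token comes from meilisearch-tokenizer
// and also carries char indexes and token kind.
#[derive(Debug)]
struct Tok<'a> {
    word: &'a str,
    byte_start: usize,
    byte_end: usize,
}

fn main() {
    let fields = ["hello world", "rust"];
    let mut byte_offset = 0;
    let mut word_offset = 0;

    for field in fields {
        let base_byte = byte_offset;
        let base_word = word_offset;

        // Pretend every whitespace-separated word is one token.
        let tokens = field.split_whitespace().enumerate().map(|(i, w)| {
            // Offset of `w` inside `field`; split_whitespace yields subslices.
            let start = w.as_ptr() as usize - field.as_ptr() as usize;
            let tok = Tok {
                word: w,
                byte_start: base_byte + start,
                byte_end: base_byte + start + w.len(),
            };
            (base_word + i, tok)
        });

        for (word_pos, token) in tokens {
            // Same updates as the diff: the next field starts one past the
            // last byte seen, so ranges from different fields never touch.
            word_offset = word_pos + 1;
            byte_offset = token.byte_end + 1;
            println!("word {:>2}: {:?}", word_pos, token);
        }
    }
}
```

Running it prints `hello` at 0..5 and `world` at 6..11, then `rust` from the second field at 12..16 as word 2, which is the cross-field invariant the diff maintains.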
@@ -114,6 +147,8 @@ impl<A: AsRef<[u8]>> RawIndexer<A> {
 
 fn index_token(
     token: Token,
+    position: usize,
+    word_pos: usize,
     id: DocumentId,
     indexed_pos: IndexedPos,
     word_limit: usize,
@@ -123,20 +158,14 @@ fn index_token(
 ) -> bool
 where A: AsRef<[u8]>,
 {
-    if token.index >= word_limit {
+    if position >= word_limit {
         return false;
     }
 
-    let lower = token.word.to_lowercase();
-    let token = Token {
-        word: &lower,
-        ..token
-    };
-
-    if !stop_words.contains(&token.word) {
-        match token_to_docindex(id, indexed_pos, token) {
+    if !stop_words.contains(&token.word.as_ref()) {
+        match token_to_docindex(id, indexed_pos, &token, word_pos) {
             Some(docindex) => {
-                let word = Vec::from(token.word);
+                let word = Vec::from(token.word.as_ref());
 
                 if word.len() <= WORD_LENGTH_LIMIT {
                     words_doc_indexes
@@ -144,20 +173,6 @@ where A: AsRef<[u8]>,
                         .or_insert_with(Vec::new)
                         .push(docindex);
                     docs_words.entry(id).or_insert_with(Vec::new).push(word);
-
-                    if !lower.contains(is_cjk) {
-                        let unidecoded = deunicode_with_tofu(&lower, "");
-                        if unidecoded != lower && !unidecoded.is_empty() {
-                            let word = Vec::from(unidecoded);
-                            if word.len() <= WORD_LENGTH_LIMIT {
-                                words_doc_indexes
-                                    .entry(word.clone())
-                                    .or_insert_with(Vec::new)
-                                    .push(docindex);
-                                docs_words.entry(id).or_insert_with(Vec::new).push(word);
-                            }
-                        }
-                    }
                 }
             }
             None => return false,
@@ -167,8 +182,8 @@ where A: AsRef<[u8]>,
     true
 }
 
-fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: Token) -> Option<DocIndex> {
-    let word_index = u16::try_from(token.word_index).ok()?;
+fn token_to_docindex(id: DocumentId, indexed_pos: IndexedPos, token: &Token, word_index: usize) -> Option<DocIndex> {
+    let word_index = u16::try_from(word_index).ok()?;
     let char_index = u16::try_from(token.char_index).ok()?;
     let char_length = u16::try_from(token.word.chars().count()).ok()?;
diff --git a/meilisearch-http/src/helpers/meilisearch.rs b/meilisearch-http/src/helpers/meilisearch.rs
index 78893c47a..dd5e2c79f 100644
--- a/meilisearch-http/src/helpers/meilisearch.rs
+++ b/meilisearch-http/src/helpers/meilisearch.rs
@@ -11,7 +11,6 @@ use meilisearch_core::criterion::*;
 use meilisearch_core::settings::RankingRule;
 use meilisearch_core::{Highlight, Index, RankedMap};
 use meilisearch_schema::{FieldId, Schema};
-use meilisearch_tokenizer::is_cjk;
 use serde::{Deserialize, Serialize};
 use serde_json::Value;
 use siphasher::sip::SipHasher;
@@ -344,7 +343,7 @@ pub struct SearchResult {
 /// returns the start index and the length on the crop.
 fn aligned_crop(text: &str, match_index: usize, context: usize) -> (usize, usize) {
-    let is_word_component = |c: &char| c.is_alphanumeric() && !is_cjk(*c);
+    let is_word_component = |c: &char| c.is_alphanumeric() && !super::is_cjk(*c);
 
     let word_end_index = |mut index| {
         if text.chars().nth(index - 1).map_or(false, |c| is_word_component(&c)) {
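`token_to_docindex` now takes the word position computed by its callers instead of reading `token.word_index`, but it still narrows every component to `u16`: if any value overflows, it returns `None` and `index_token` treats the token as unindexable. A hedged sketch of that narrowing; `DocIndexLike` is a stand-in, since `DocIndex`'s full layout isn't shown in this diff:

```rust
use std::convert::TryFrom;

// Stand-in for meilisearch-core's DocIndex; only the three fields
// narrowed by token_to_docindex are sketched here.
struct DocIndexLike {
    word_index: u16,
    char_index: u16,
    char_length: u16,
}

// Mirrors the narrowing in token_to_docindex: any value that overflows
// u16 turns the whole conversion into None.
fn to_docindex(word_pos: usize, char_index: usize, word: &str) -> Option<DocIndexLike> {
    Some(DocIndexLike {
        word_index: u16::try_from(word_pos).ok()?,
        char_index: u16::try_from(char_index).ok()?,
        char_length: u16::try_from(word.chars().count()).ok()?,
    })
}

fn main() {
    let idx = to_docindex(3, 10, "héllo").unwrap();
    println!("{} {} {}", idx.word_index, idx.char_index, idx.char_length); // 3 10 5
    assert!(to_docindex(70_000, 0, "x").is_none()); // 70_000 > u16::MAX
}
```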
diff --git a/meilisearch-http/src/helpers/mod.rs b/meilisearch-http/src/helpers/mod.rs
index 471336db9..9ba62a3a7 100644
--- a/meilisearch-http/src/helpers/mod.rs
+++ b/meilisearch-http/src/helpers/mod.rs
@@ -5,3 +5,22 @@ pub mod compression;
 
 pub use authentication::Authentication;
 pub use normalize_path::NormalizePath;
+
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{1100}' && c <= '\u{11ff}') // Hangul Jamo
+        || (c >= '\u{2e80}' && c <= '\u{2eff}') // CJK Radicals Supplement
+        || (c >= '\u{2f00}' && c <= '\u{2fdf}') // Kangxi radical
+        || (c >= '\u{3000}' && c <= '\u{303f}') // Japanese-style punctuation
+        || (c >= '\u{3040}' && c <= '\u{309f}') // Japanese Hiragana
+        || (c >= '\u{30a0}' && c <= '\u{30ff}') // Japanese Katakana
+        || (c >= '\u{3100}' && c <= '\u{312f}')
+        || (c >= '\u{3130}' && c <= '\u{318F}') // Hangul Compatibility Jamo
+        || (c >= '\u{3200}' && c <= '\u{32ff}') // Enclosed CJK Letters and Months
+        || (c >= '\u{3400}' && c <= '\u{4dbf}') // CJK Unified Ideographs Extension A
+        || (c >= '\u{4e00}' && c <= '\u{9fff}') // CJK Unified Ideographs
+        || (c >= '\u{a960}' && c <= '\u{a97f}') // Hangul Jamo Extended-A
+        || (c >= '\u{ac00}' && c <= '\u{d7a3}') // Hangul Syllables
+        || (c >= '\u{d7b0}' && c <= '\u{d7ff}') // Hangul Jamo Extended-B
+        || (c >= '\u{f900}' && c <= '\u{faff}') // CJK Compatibility Ideographs
+        || (c >= '\u{ff00}' && c <= '\u{ffef}') // Full-width roman characters and half-width katakana
+}
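Since `is_cjk` moved from `meilisearch-tokenizer` into `meilisearch-http`'s helpers, a small test module (hypothetical, not part of this diff) could be dropped at the bottom of `helpers/mod.rs` to pin one representative character per major block:

```rust
#[cfg(test)]
mod tests {
    use super::is_cjk;

    #[test]
    fn recognizes_common_cjk_blocks() {
        assert!(is_cjk('漢')); // U+6F22, CJK Unified Ideographs
        assert!(is_cjk('ひ')); // U+3072, Hiragana
        assert!(is_cjk('カ')); // U+30AB, Katakana
        assert!(is_cjk('한')); // U+D55C, Hangul Syllables
        assert!(is_cjk('Ａ')); // U+FF21, full-width roman
    }

    #[test]
    fn rejects_non_cjk() {
        assert!(!is_cjk('a'));
        assert!(!is_cjk('é'));
        assert!(!is_cjk('3'));
    }
}
```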