diff --git a/benchmarks/benches/formatting.rs b/benchmarks/benches/formatting.rs
index 5045df268..25e88ffeb 100644
--- a/benchmarks/benches/formatting.rs
+++ b/benchmarks/benches/formatting.rs
@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main};
-use milli::tokenizer::{Analyzer, AnalyzerConfig};
+use milli::tokenizer::Tokenize;
 use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
 
 #[cfg(target_os = "linux")]
@@ -52,9 +52,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
         for conf in confs {
             group.bench_function(conf.name, |b| {
                 b.iter(|| {
-                    let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-                    let analyzed = analyzer.analyze(&conf.text);
-                    let tokens: Vec<_> = analyzed.tokens().collect();
+                    let tokens: Vec<_> = conf.text.tokenize().collect();
                     let mut matcher = conf.matching_words.build(&tokens[..], conf.text);
                     matcher.format(option.clone());
                 })
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index 641f82046..57a78b41e 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -19,7 +19,7 @@ use flate2::read::GzDecoder;
 use futures::{stream, FutureExt, StreamExt};
 use heed::EnvOpenOptions;
 use milli::documents::DocumentBatchReader;
-use milli::tokenizer::{Analyzer, AnalyzerConfig};
+use milli::tokenizer::{Tokenizer, TokenizerBuilder};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{
     ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
@@ -139,17 +139,16 @@ pub struct IndexerOpt {
     pub max_positions_per_attributes: Option<u32>,
 }
 
-struct Highlighter<'a, A> {
-    analyzer: Analyzer<'a, A>,
+struct Highlighter<'s, A> {
+    tokenizer: Tokenizer<'s, A>,
 }
 
-impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
-    fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let mut config = AnalyzerConfig::default();
-        config.stop_words(stop_words);
-        let analyzer = Analyzer::new(config);
+impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
+    fn new(stop_words: &'s fst::Set<A>) -> Self {
+        let mut builder = TokenizerBuilder::new();
+        builder.stop_words(stop_words);
 
-        Self { analyzer }
+        Self { tokenizer: builder.build() }
     }
 
     fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value {
@@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
-                let analyzed = self.analyzer.analyze(&old_string);
-                let analyzed: Vec<_> = analyzed.tokens().collect();
-                let mut matcher = matcher_builder.build(&analyzed[..], &old_string);
+                let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect();
+                let mut matcher = matcher_builder.build(&tokens[..], &old_string);
 
                 let format_options = FormatOptions { highlight: true, crop: Some(10) };
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 696384a01..d19ff03a9 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "0.2.17"
 byteorder = "1.4.3"
+charabia = "0.5.0"
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.2"
 either = "1.6.1"
+flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
 fxhash = "0.2.1"
-flatten-serde-json = { path = "../flatten-serde-json" }
-grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 geoutils = "0.4.1"
+grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
= ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } -meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" } memmap2 = "0.5.3" obkv = "0.2.0" once_cell = "1.10.0" diff --git a/milli/src/lib.rs b/milli/src/lib.rs index e718dccae..f28677ed8 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; use serde_json::{Map, Value}; -pub use {heed, meilisearch_tokenizer as tokenizer}; +pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; pub use self::criterion::{default_criteria, Criterion, CriterionError}; diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs index 84b47bba5..71fbfd794 100644 --- a/milli/src/search/matches/matching_words.rs +++ b/milli/src/search/matches/matching_words.rs @@ -3,8 +3,8 @@ use std::collections::BTreeMap; use std::fmt; use std::ops::{Index, IndexMut}; +use charabia::Token; use levenshtein_automata::{Distance, DFA}; -use meilisearch_tokenizer::Token; use crate::search::build_dfa; @@ -99,13 +99,13 @@ impl MatchingWord { /// Returns the lenght in chars of the match in case of the token matches the term. pub fn match_token(&self, token: &Token) -> Option { - match self.dfa.eval(token.text()) { + match self.dfa.eval(token.lemma()) { Distance::Exact(t) if t <= self.typo => { if self.prefix { - let len = bytes_to_highlight(token.text(), &self.word); - Some(token.num_chars_from_bytes(len)) + let len = bytes_to_highlight(token.lemma(), &self.word); + Some(token.original_lengths(len).0) } else { - Some(token.num_chars_from_bytes(token.text().len())) + Some(token.original_lengths(token.lemma().len()).0) } } _otherwise => None, @@ -262,7 +262,7 @@ mod tests { use std::borrow::Cow; use std::str::from_utf8; - use meilisearch_tokenizer::TokenKind; + use charabia::TokenKind; use super::*; use crate::MatchingWords; @@ -344,11 +344,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("word"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("word"), + char_end: "word".chars().count(), byte_end: "word".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 3, ids: &[2] }) @@ -357,11 +356,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("nyc"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), byte_end: "nyc".len(), - char_map: None, + ..Default::default() }) .next(), None @@ -370,11 +368,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("world"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), byte_end: "world".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 5, ids: &[2] }) @@ -383,11 +380,10 @@ mod tests { matching_words .match_token(&Token { kind: TokenKind::Word, - word: Cow::Borrowed("splitted"), - byte_start: 0, - char_index: 0, + lemma: Cow::Borrowed("splitted"), + char_end: "splitted".chars().count(), byte_end: "splitted".len(), - char_map: None, + ..Default::default() }) .next(), Some(MatchType::Full { char_len: 5, ids: &[0] }) @@ -396,11 +392,10 @@ mod 
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("thisnew"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("thisnew"),
+                    char_end: "thisnew".chars().count(),
                     byte_end: "thisnew".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             None
@@ -409,11 +404,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("borld"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("borld"),
+                    char_end: "borld".chars().count(),
                     byte_end: "borld".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -422,11 +416,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("wordsplit"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("wordsplit"),
+                    char_end: "wordsplit".chars().count(),
                     byte_end: "wordsplit".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 4, ids: &[2] })
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index d89e7dcb6..85e77e15b 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -1,8 +1,8 @@
 use std::borrow::Cow;
 
+use charabia::{SeparatorKind, Token};
 use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
 pub use matching_words::{MatchingWord, MatchingWords};
-use meilisearch_tokenizer::token::{SeparatorKind, Token};
 use serde::Serialize;
 
 pub mod matching_words;
@@ -168,13 +168,13 @@ impl<'t> Matcher<'t, '_> {
                 let current_token_position = *token_position;
                 let current_word_position = *word_position;
                 *token_position += 1;
-                if token.is_separator().is_none() {
+                if !token.is_separator() {
                     *word_position += 1;
                 }
 
                 Some((current_token_position, current_word_position, token))
             })
-            .filter(|(_, _, token)| token.is_separator().is_none());
+            .filter(|(_, _, token)| !token.is_separator());
 
         while let Some((token_position, word_position, word)) = words_positions.next() {
             for match_type in self.matching_words.match_token(word) {
@@ -243,8 +243,8 @@ impl<'t> Matcher<'t, '_> {
         let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
 
         while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.is_separator());
-            let after_token = after_tokens.peek().map(|t| t.is_separator());
+            let before_token = before_tokens.peek().map(|t| t.separator_kind());
+            let after_token = after_tokens.peek().map(|t| t.separator_kind());
 
             match (before_token, after_token) {
                 // we can expand both sides.
@@ -470,7 +470,7 @@ impl<'t> Matcher<'t, '_> {
 #[cfg(test)]
 mod tests {
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+    use charabia::Tokenize;
 
     use super::*;
     use crate::search::matches::matching_words::MatchingWord;
@@ -490,30 +490,26 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: false, crop: None };
 
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
 
         // Text containing all matches.
let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop and no highlight should return complete text. assert_eq!(&matcher.format(format_options), &text); @@ -524,44 +520,38 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; // empty text. let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text, because there is no matches. assert_eq!(&matcher.format(format_options), &text); // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!( @@ -580,30 +570,26 @@ mod tests { let matching_words = MatchingWords::new(matching_words); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; // Text containing prefix match. 
let text = "Ŵôřlḑôle"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Ŵôřlḑôle"); // Text containing unicode match. let text = "Ŵôřlḑ"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Ŵôřlḑ"); // Text containing unicode match. let text = "Westfália"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. assert_eq!(&matcher.format(format_options), "Westfália"); @@ -614,28 +600,24 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: false, crop: Some(10) }; // empty text. let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( @@ -645,8 +627,7 @@ mod tests { // Text without any match starting by a separator. let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 first words with a marker at the end. assert_eq!( @@ -656,19 +637,17 @@ mod tests { // Test phrase propagation let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(format_options), - "…Split The World is a book written by Emily Henry…" + "… Split The World is a book written by Emily Henry…", ); // Text containing some matches. 
let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( @@ -678,8 +657,7 @@ mod tests { // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // no highlight should return 10 last words with a marker at the start. assert_eq!( @@ -689,8 +667,7 @@ mod tests { // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -700,8 +677,7 @@ mod tests { // Text containing matches with diferent density. let text = "split void the void void world void void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -711,8 +687,7 @@ mod tests { // Text containing matches with same word. let text = "split split split split split split void void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -726,28 +701,24 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: Some(10) }; // empty text. let text = ""; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ""); // text containing only separators. let text = ":-)"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!(&matcher.format(format_options), ":-)"); // Text without any match. let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 first words with a marker at the end. assert_eq!( @@ -757,8 +728,7 @@ mod tests { // Text containing some matches. 
let text = "Natalie risk her future to build a world with the boy she loves."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!( @@ -768,16 +738,14 @@ mod tests { // Text containing all matches. let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // both should return 10 last words with a marker at the start and highlighted matches. assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: The Love That Split The World."); // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); // crop should return 10 last words with a marker at the start. assert_eq!( @@ -792,11 +760,9 @@ mod tests { let matching_words = matching_words(); let builder = MatcherBuilder::from_matching_words(matching_words); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let text = "void void split the world void void."; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); // set a smaller crop size let format_options = FormatOptions { highlight: false, crop: Some(2) }; @@ -847,13 +813,11 @@ mod tests { let mut builder = MatcherBuilder::from_matching_words(matching_words); builder.highlight_prefix("_".to_string()); builder.highlight_suffix("_".to_string()); - let analyzer = Analyzer::new(AnalyzerConfig::>::default()); let format_options = FormatOptions { highlight: true, crop: None }; let text = "the do or die can't be he do and or isn't he"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); + let tokens: Vec<_> = text.tokenize().collect(); let mut matcher = builder.build(&tokens[..], text); assert_eq!( diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f3f852a48..62a7815b0 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -6,12 +6,12 @@ use std::result::Result as StdResult; use std::str::Utf8Error; use std::time::Instant; +use charabia::TokenizerBuilder; use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct}; use fst::automaton::Str; use fst::{Automaton, IntoStreamer, Streamer}; use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA}; use log::debug; -use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; @@ -126,14 +126,14 @@ impl<'a> Search<'a> { builder.words_limit(self.words_limit); // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. 
-        let mut config = AnalyzerConfig::default();
+        let mut tokbuilder = TokenizerBuilder::new();
         let stop_words = self.index.stop_words(self.rtxn)?;
         if let Some(ref stop_words) = stop_words {
-            config.stop_words(stop_words);
+            tokbuilder.stop_words(stop_words);
         }
-        let analyzer = Analyzer::new(config);
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+
+        let tokenizer = tokbuilder.build();
+        let tokens = tokenizer.tokenize(query);
         builder
             .build(tokens)?
             .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
diff --git a/milli/src/search/query_tree.rs b/milli/src/search/query_tree.rs
index 76748179b..e0fac0f43 100644
--- a/milli/src/search/query_tree.rs
+++ b/milli/src/search/query_tree.rs
@@ -1,10 +1,9 @@
 use std::borrow::Cow;
 use std::{cmp, fmt, mem};
 
+use charabia::classifier::ClassifiedTokenIter;
+use charabia::{SeparatorKind, TokenKind};
 use fst::Set;
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::tokenizer::TokenStream;
-use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
@@ -235,9 +234,9 @@ impl<'a> QueryTreeBuilder<'a> {
     /// - if `authorize_typos` is set to `false` the query tree will be generated
     /// forcing all query words to match documents without any typo
     /// (the criterion `typo` will be ignored)
-    pub fn build(
+    pub fn build<A: AsRef<[u8]>>(
         &self,
-        query: TokenStream,
+        query: ClassifiedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let stop_words = self.index.stop_words(self.rtxn)?;
         let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
@@ -649,11 +648,14 @@ impl PrimitiveQueryPart {
 
 /// Create primitive query from tokenized query string,
 /// the primitive query is an intermediate state to build the query tree.
-fn create_primitive_query(
-    query: TokenStream,
+fn create_primitive_query<A>(
+    query: ClassifiedTokenIter<A>,
     stop_words: Option<Set<&[u8]>>,
     words_limit: Option<usize>,
-) -> PrimitiveQuery {
+) -> PrimitiveQuery
+where
+    A: AsRef<[u8]>,
+{
     let mut primitive_query = Vec::new();
     let mut phrase = Vec::new();
     let mut quoted = false;
@@ -673,21 +675,18 @@ fn create_primitive_query(
                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                 // 3. if the word is the last token of the query we push it as a prefix word.
                 if quoted {
-                    phrase.push(token.word.to_string());
+                    phrase.push(token.lemma().to_string());
                 } else if peekable.peek().is_some() {
-                    if !stop_words
-                        .as_ref()
-                        .map_or(false, |swords| swords.contains(token.word.as_ref()))
-                    {
+                    if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
                         primitive_query
-                            .push(PrimitiveQueryPart::Word(token.word.to_string(), false));
+                            .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
                     }
                 } else {
-                    primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
+                    primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true));
                 }
             }
             TokenKind::Separator(separator_kind) => {
-                let quote_count = token.word.chars().filter(|&s| s == '"').count();
+                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
                 // swap quoted state if we encounter a double quote
                 if quote_count % 2 != 0 {
                     quoted = !quoted;
@@ -738,8 +737,8 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;
 
+    use charabia::Tokenize;
     use maplit::hashmap;
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
 
@@ -754,12 +753,12 @@ mod test {
     }
 
     impl TestContext {
-        fn build(
+        fn build<A: AsRef<[u8]>>(
             &self,
             optional_words: bool,
             authorize_typos: bool,
             words_limit: Option<usize>,
-            query: TokenStream,
+            query: ClassifiedTokenIter<A>,
         ) -> Result<Option<(Operation, PrimitiveQuery)>> {
             let primitive_query = create_primitive_query(query, None, words_limit);
             if !primitive_query.is_empty() {
@@ -856,9 +855,7 @@ mod test {
     #[test]
     fn prefix() {
         let query = "hey friends";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -889,9 +886,7 @@
     #[test]
     fn no_prefix() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -922,9 +917,7 @@
     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -987,9 +980,7 @@
     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1087,9 +1078,7 @@
     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1120,9 +1109,7 @@
     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1159,9 +1146,7 @@
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1177,9 +1162,7 @@
     #[test]
     fn phrase_with_hard_separator() {
         let query = "\"hey friends. wooop wooop\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1195,9 +1178,7 @@
     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             true,
@@ -1280,9 +1261,7 @@
     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
         let (query_tree, _) =
@@ -1294,9 +1273,7 @@
     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             true,
@@ -1365,9 +1342,7 @@
     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1397,9 +1372,7 @@
     #[test]
     fn words_limit() {
         let query = "\"hey my\" good friend";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
@@ -1441,10 +1414,8 @@
     #[test]
     fn disable_typo_on_word() {
         let query = "goodbye";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
+        let tokens = query.tokenize();
 
-        let tokens = result.tokens();
         let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
         let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
         let context = TestContext { exact_words, ..Default::default() };
diff --git a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
index 44bf9dbf7..9a6060805 100644
--- a/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
+++ b/milli/src/update/index_documents/extract/extract_docid_word_positions.rs
@@ -3,8 +3,7 @@
 use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
+use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -40,11 +39,11 @@ pub fn extract_docid_word_positions(
     let mut key_buffer = Vec::new();
     let mut field_buffer = String::new();
-    let mut config = AnalyzerConfig::default();
+    let mut builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
-        config.stop_words(stop_words);
+        builder.stop_words(stop_words);
     }
-    let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
+    let tokenizer = builder.build();
 
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -64,12 +63,11 @@ pub fn extract_docid_word_positions(
                 serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
             field_buffer.clear();
             if let Some(field) = json_to_string(&value, &mut field_buffer) {
-                let analyzed = analyzer.analyze(field);
-                let tokens = process_tokens(analyzed.tokens())
+                let tokens = process_tokens(tokenizer.tokenize(field))
                     .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
 
                 for (index, token) in tokens {
-                    let token = token.text().trim();
+                    let token = token.lemma().trim();
                     if !token.is_empty() {
                         key_buffer.truncate(mem::size_of::<u32>());
                         key_buffer.extend_from_slice(token.as_bytes());
@@ -146,7 +144,7 @@ fn process_tokens<'a>(
     tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
-        .skip_while(|token| token.is_separator().is_some())
+        .skip_while(|token| token.is_separator())
         .scan((0, None), |(offset, prev_kind), token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index 829932d5c..9363d8eb6 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -1,8 +1,8 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
+use charabia::{Tokenizer, TokenizerBuilder};
 use itertools::Itertools;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;
 
@@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
-                    analyzer
-                        .analyze(text)
-                        .tokens()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                    tokenizer
+                        .tokenize(text)
                         .filter_map(|token| {
                             if token.is_word() {
-                                Some(token.text().to_string())
+                                Some(token.lemma().to_string())
                             } else {
                                 None
                             }
                         })
                         .collect::<Vec<_>>()
                 }
 
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
 
                 let mut new_synonyms = HashMap::new();
                 for (word, synonyms) in synonyms {
                     // Normalize both the word and associated synonyms.
-                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_word = normalize(&tokenizer, word);
                     let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
+                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
 
                     // Store the normalized synonyms under the normalized word,
                     // merging the possible duplicate words.
@@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
-                    analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
 
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
 
                 let mut words: Vec<_> =
-                    words.iter().map(|word| normalize(&analyzer, word)).collect();
+                    words.iter().map(|word| normalize(&tokenizer, word)).collect();
 
                 // normalization could reorder words
                 words.sort_unstable();
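
Every call site touched above follows the same replacement pattern: build a charabia tokenizer (optionally wired with the stop-word `fst::Set`), iterate tokens, and read `lemma()` / `is_separator()` instead of the old `text()` / `is_separator().is_some()`. The snippet below is a minimal sketch of that pattern, not code from the PR; it assumes charabia 0.5 and fst 0.4 as pinned in milli/Cargo.toml, and the sample text and stop-word list are invented.

```rust
// Sketch of the charabia API this diff migrates to (assumes charabia 0.5, fst 0.4).
use charabia::{Tokenize, TokenizerBuilder};

fn main() {
    // One-off tokenization through the `Tokenize` trait, as in the updated tests:
    // `text.tokenize()` replaces the old `Analyzer::analyze(text).tokens()`.
    let tokens: Vec<_> = "The quick brown fox".tokenize().collect();
    for token in &tokens {
        // `lemma()` replaces `text()`; `is_separator()` now returns a bool.
        println!("{:?} (separator: {})", token.lemma(), token.is_separator());
    }

    // Reusable tokenizer carrying a stop-word set, as in `Highlighter::new` and
    // `Search::execute`: `TokenizerBuilder` replaces `AnalyzerConfig` + `Analyzer`.
    // Example stop-word set (entries must be in lexicographic order for fst).
    let stop_words = fst::Set::from_iter(["a", "the"]).unwrap();
    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    let lemmas: Vec<String> =
        tokenizer.tokenize("The quick brown fox").map(|t| t.lemma().to_string()).collect();
    println!("{:?}", lemmas);
}
```

Note that, as in the settings and extraction code above, configuring stop words classifies those tokens (so callers can skip them); it does not remove them from the token stream.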