540: Integrate charabia r=Kerollmops a=ManyTheFish

related to https://github.com/meilisearch/meilisearch/issues/2375
related to https://github.com/meilisearch/meilisearch/issues/2144
related to https://github.com/meilisearch/meilisearch/issues/2417
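
For reviewers, here is a minimal before/after sketch of the tokenizer API migration applied throughout the diff below. It is illustrative only (not part of the changeset) and assumes the charabia 0.5 API added by this PR, with a small `fst` stop-word set built inline for the example:

```rust
use charabia::{Tokenize, TokenizerBuilder};

fn main() {
    // Default tokenization: the `Tokenize` extension trait on `&str` replaces the old
    // `Analyzer::new(AnalyzerConfig::default())` + `analyze(text)` + `.tokens()` chain.
    let tokens: Vec<_> = "A quick brown fox".tokenize().collect();
    assert!(!tokens.is_empty());

    // When stop words matter, `TokenizerBuilder` replaces `AnalyzerConfig::stop_words`.
    let stop_words = fst::Set::from_iter(["the"]).unwrap();
    let mut builder = TokenizerBuilder::new();
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("The quick brown fox") {
        // `lemma()` is the charabia name for the old `text()` accessor.
        println!("{:?} -> {:?}", token.lemma(), token.kind);
    }
}
```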

Co-authored-by: ManyTheFish <many@meilisearch.com>
commit dd186533f0 by bors[bot], 2022-06-02 15:34:33 +00:00, committed by GitHub
10 changed files with 142 additions and 221 deletions

View File

@@ -1,5 +1,5 @@
 use criterion::{criterion_group, criterion_main};
-use milli::tokenizer::{Analyzer, AnalyzerConfig};
+use milli::tokenizer::Tokenize;
 use milli::{FormatOptions, MatcherBuilder, MatchingWord, MatchingWords};
 #[cfg(target_os = "linux")]
@@ -52,9 +52,7 @@ fn bench_formatting(c: &mut criterion::Criterion) {
     for conf in confs {
         group.bench_function(conf.name, |b| {
             b.iter(|| {
-                let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-                let analyzed = analyzer.analyze(&conf.text);
-                let tokens: Vec<_> = analyzed.tokens().collect();
+                let tokens: Vec<_> = conf.text.tokenize().collect();
                 let mut matcher = conf.matching_words.build(&tokens[..], conf.text);
                 matcher.format(option.clone());
             })

View File

@@ -19,7 +19,7 @@ use flate2::read::GzDecoder;
 use futures::{stream, FutureExt, StreamExt};
 use heed::EnvOpenOptions;
 use milli::documents::DocumentBatchReader;
-use milli::tokenizer::{Analyzer, AnalyzerConfig};
+use milli::tokenizer::{Tokenizer, TokenizerBuilder};
 use milli::update::UpdateIndexingStep::*;
 use milli::update::{
     ClearDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Setting,
@@ -139,17 +139,16 @@ pub struct IndexerOpt {
     pub max_positions_per_attributes: Option<u32>,
 }
-struct Highlighter<'a, A> {
-    analyzer: Analyzer<'a, A>,
+struct Highlighter<'s, A> {
+    tokenizer: Tokenizer<'s, A>,
 }
-impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
-    fn new(stop_words: &'a fst::Set<A>) -> Self {
-        let mut config = AnalyzerConfig::default();
-        config.stop_words(stop_words);
-        let analyzer = Analyzer::new(config);
-        Self { analyzer }
+impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> {
+    fn new(stop_words: &'s fst::Set<A>) -> Self {
+        let mut builder = TokenizerBuilder::new();
+        builder.stop_words(stop_words);
+        Self { tokenizer: builder.build() }
     }
     fn highlight_value(&self, value: Value, matcher_builder: &MatcherBuilder) -> Value {
@@ -158,9 +157,8 @@ impl<'a, A: AsRef<[u8]>> Highlighter<'a, A> {
             Value::Bool(boolean) => Value::Bool(boolean),
             Value::Number(number) => Value::Number(number),
             Value::String(old_string) => {
-                let analyzed = self.analyzer.analyze(&old_string);
-                let analyzed: Vec<_> = analyzed.tokens().collect();
-                let mut matcher = matcher_builder.build(&analyzed[..], &old_string);
+                let tokens: Vec<_> = self.tokenizer.tokenize(&old_string).collect();
+                let mut matcher = matcher_builder.build(&tokens[..], &old_string);
                 let format_options = FormatOptions { highlight: true, crop: Some(10) };

View File

@@ -9,18 +9,18 @@ bimap = { version = "0.6.2", features = ["serde"] }
 bincode = "1.3.3"
 bstr = "0.2.17"
 byteorder = "1.4.3"
+charabia = "0.5.0"
 concat-arrays = "0.1.2"
 crossbeam-channel = "0.5.2"
 either = "1.6.1"
+flatten-serde-json = { path = "../flatten-serde-json" }
 fst = "0.4.7"
 fxhash = "0.2.1"
-flatten-serde-json = { path = "../flatten-serde-json" }
-grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 geoutils = "0.4.1"
+grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] }
 heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
-meilisearch-tokenizer = { git = "https://github.com/meilisearch/tokenizer.git", tag = "v0.2.9" }
 memmap2 = "0.5.3"
 obkv = "0.2.0"
 once_cell = "1.10.0"

View File

@@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
 use serde_json::{Map, Value};
-pub use {heed, meilisearch_tokenizer as tokenizer};
+pub use {charabia as tokenizer, heed};
 pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
 pub use self::criterion::{default_criteria, Criterion, CriterionError};

View File

@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
 use std::fmt;
 use std::ops::{Index, IndexMut};
+use charabia::Token;
 use levenshtein_automata::{Distance, DFA};
-use meilisearch_tokenizer::Token;
 use crate::search::build_dfa;
@@ -99,13 +99,13 @@ impl MatchingWord {
     /// Returns the lenght in chars of the match in case of the token matches the term.
     pub fn match_token(&self, token: &Token) -> Option<usize> {
-        match self.dfa.eval(token.text()) {
+        match self.dfa.eval(token.lemma()) {
             Distance::Exact(t) if t <= self.typo => {
                 if self.prefix {
-                    let len = bytes_to_highlight(token.text(), &self.word);
-                    Some(token.num_chars_from_bytes(len))
+                    let len = bytes_to_highlight(token.lemma(), &self.word);
+                    Some(token.original_lengths(len).0)
                 } else {
-                    Some(token.num_chars_from_bytes(token.text().len()))
+                    Some(token.original_lengths(token.lemma().len()).0)
                 }
             }
             _otherwise => None,
@@ -262,7 +262,7 @@ mod tests {
     use std::borrow::Cow;
     use std::str::from_utf8;
-    use meilisearch_tokenizer::TokenKind;
+    use charabia::TokenKind;
     use super::*;
     use crate::MatchingWords;
@@ -344,11 +344,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("word"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("word"),
+                    char_end: "word".chars().count(),
                     byte_end: "word".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 3, ids: &[2] })
@@ -357,11 +356,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("nyc"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("nyc"),
+                    char_end: "nyc".chars().count(),
                     byte_end: "nyc".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             None
@@ -370,11 +368,10 @@ mod tests {
            matching_words
                .match_token(&Token {
                    kind: TokenKind::Word,
-                    word: Cow::Borrowed("world"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("world"),
+                    char_end: "world".chars().count(),
                    byte_end: "world".len(),
-                    char_map: None,
+                    ..Default::default()
                })
                .next(),
            Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -383,11 +380,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("splitted"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("splitted"),
+                    char_end: "splitted".chars().count(),
                     byte_end: "splitted".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[0] })
@@ -396,11 +392,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("thisnew"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("thisnew"),
+                    char_end: "thisnew".chars().count(),
                     byte_end: "thisnew".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             None
@@ -409,11 +404,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("borld"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("borld"),
+                    char_end: "borld".chars().count(),
                     byte_end: "borld".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -422,11 +416,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("wordsplit"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("wordsplit"),
+                    char_end: "wordsplit".chars().count(),
                     byte_end: "wordsplit".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 4, ids: &[2] })

View File

@@ -1,8 +1,8 @@
 use std::borrow::Cow;
+use charabia::{SeparatorKind, Token};
 use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
 pub use matching_words::{MatchingWord, MatchingWords};
-use meilisearch_tokenizer::token::{SeparatorKind, Token};
 use serde::Serialize;
 pub mod matching_words;
@@ -168,13 +168,13 @@ impl<'t> Matcher<'t, '_> {
                 let current_token_position = *token_position;
                 let current_word_position = *word_position;
                 *token_position += 1;
-                if token.is_separator().is_none() {
+                if !token.is_separator() {
                     *word_position += 1;
                 }
                 Some((current_token_position, current_word_position, token))
             })
-            .filter(|(_, _, token)| token.is_separator().is_none());
+            .filter(|(_, _, token)| !token.is_separator());
         while let Some((token_position, word_position, word)) = words_positions.next() {
             for match_type in self.matching_words.match_token(word) {
@@ -243,8 +243,8 @@ impl<'t> Matcher<'t, '_> {
         let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
         while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.is_separator());
-            let after_token = after_tokens.peek().map(|t| t.is_separator());
+            let before_token = before_tokens.peek().map(|t| t.separator_kind());
+            let after_token = after_tokens.peek().map(|t| t.separator_kind());
             match (before_token, after_token) {
                 // we can expand both sides.
@@ -470,7 +470,7 @@ impl<'t> Matcher<'t, '_> {
 #[cfg(test)]
 mod tests {
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+    use charabia::Tokenize;
     use super::*;
     use crate::search::matches::matching_words::MatchingWord;
@@ -490,30 +490,26 @@ mod tests {
         let matching_words = matching_words();
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: false, crop: None };
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
@@ -524,44 +520,38 @@ mod tests {
         let matching_words = matching_words();
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: true, crop: None };
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text, because there is no matches.
         assert_eq!(&matcher.format(format_options), &text);
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(
@@ -580,30 +570,26 @@ mod tests {
         let matching_words = MatchingWords::new(matching_words);
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: true, crop: None };
         // Text containing prefix match.
         let text = "Ŵôřlḑôle";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle");
         // Text containing unicode match.
         let text = "Ŵôřlḑ";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>");
         // Text containing unicode match.
         let text = "Westfália";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a");
@@ -614,28 +600,24 @@ mod tests {
         let matching_words = matching_words();
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: false, crop: Some(10) };
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -645,8 +627,7 @@ mod tests {
         // Text without any match starting by a separator.
         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -656,19 +637,17 @@ mod tests {
         // Test phrase propagation
         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // should crop the phrase instead of croping around the match.
         assert_eq!(
             &matcher.format(format_options),
-            "Split The World is a book written by Emily Henry…"
+            " Split The World is a book written by Emily Henry…",
         );
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -678,8 +657,7 @@ mod tests {
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -689,8 +667,7 @@ mod tests {
         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -700,8 +677,7 @@ mod tests {
         // Text containing matches with diferent density.
         let text = "split void the void void world void void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -711,8 +687,7 @@ mod tests {
         // Text containing matches with same word.
         let text = "split split split split split split void void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -726,28 +701,24 @@ mod tests {
         let matching_words = matching_words();
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 first words with a marker at the end.
         assert_eq!(
@@ -757,8 +728,7 @@ mod tests {
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 last words with a marker at the start and highlighted matches.
         assert_eq!(
@@ -768,16 +738,14 @@ mod tests {
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 last words with a marker at the start and highlighted matches.
         assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -792,11 +760,9 @@ mod tests {
         let matching_words = matching_words();
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let text = "void void split the world void void.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         // set a smaller crop size
         let format_options = FormatOptions { highlight: false, crop: Some(2) };
@@ -847,13 +813,11 @@ mod tests {
         let mut builder = MatcherBuilder::from_matching_words(matching_words);
         builder.highlight_prefix("_".to_string());
         builder.highlight_suffix("_".to_string());
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
         let format_options = FormatOptions { highlight: true, crop: None };
         let text = "the do or die can't be he do and or isn't he";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(

View File

@@ -6,12 +6,12 @@ use std::result::Result as StdResult;
 use std::str::Utf8Error;
 use std::time::Instant;
+use charabia::TokenizerBuilder;
 use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
 use fst::automaton::Str;
 use fst::{Automaton, IntoStreamer, Streamer};
 use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use log::debug;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
@@ -126,14 +126,14 @@ impl<'a> Search<'a> {
             builder.words_limit(self.words_limit);
             // We make sure that the analyzer is aware of the stop words
             // this ensures that the query builder is able to properly remove them.
-            let mut config = AnalyzerConfig::default();
+            let mut tokbuilder = TokenizerBuilder::new();
             let stop_words = self.index.stop_words(self.rtxn)?;
             if let Some(ref stop_words) = stop_words {
-                config.stop_words(stop_words);
+                tokbuilder.stop_words(stop_words);
            }
-            let analyzer = Analyzer::new(config);
-            let result = analyzer.analyze(query);
-            let tokens = result.tokens();
+            let tokenizer = tokbuilder.build();
+            let tokens = tokenizer.tokenize(query);
             builder
                 .build(tokens)?
                 .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))

View File

@@ -1,10 +1,9 @@
 use std::borrow::Cow;
 use std::{cmp, fmt, mem};
+use charabia::classifier::ClassifiedTokenIter;
+use charabia::{SeparatorKind, TokenKind};
 use fst::Set;
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::tokenizer::TokenStream;
-use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
@@ -235,9 +234,9 @@ impl<'a> QueryTreeBuilder<'a> {
     /// - if `authorize_typos` is set to `false` the query tree will be generated
     /// forcing all query words to match documents without any typo
     /// (the criterion `typo` will be ignored)
-    pub fn build(
+    pub fn build<A: AsRef<[u8]>>(
         &self,
-        query: TokenStream,
+        query: ClassifiedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let stop_words = self.index.stop_words(self.rtxn)?;
         let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
@@ -649,11 +648,14 @@ impl PrimitiveQueryPart {
 /// Create primitive query from tokenized query string,
 /// the primitive query is an intermediate state to build the query tree.
-fn create_primitive_query(
-    query: TokenStream,
+fn create_primitive_query<A>(
+    query: ClassifiedTokenIter<A>,
     stop_words: Option<Set<&[u8]>>,
     words_limit: Option<usize>,
-) -> PrimitiveQuery {
+) -> PrimitiveQuery
+where
+    A: AsRef<[u8]>,
+{
     let mut primitive_query = Vec::new();
     let mut phrase = Vec::new();
     let mut quoted = false;
@@ -673,21 +675,18 @@ fn create_primitive_query(
                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                 // 3. if the word is the last token of the query we push it as a prefix word.
                 if quoted {
-                    phrase.push(token.word.to_string());
+                    phrase.push(token.lemma().to_string());
                 } else if peekable.peek().is_some() {
-                    if !stop_words
-                        .as_ref()
-                        .map_or(false, |swords| swords.contains(token.word.as_ref()))
-                    {
+                    if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
                         primitive_query
-                            .push(PrimitiveQueryPart::Word(token.word.to_string(), false));
+                            .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
                     }
                 } else {
-                    primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
+                    primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true));
                 }
             }
             TokenKind::Separator(separator_kind) => {
-                let quote_count = token.word.chars().filter(|&s| s == '"').count();
+                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
                 // swap quoted state if we encounter a double quote
                 if quote_count % 2 != 0 {
                     quoted = !quoted;
@@ -738,8 +737,8 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;
+    use charabia::Tokenize;
     use maplit::hashmap;
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
@@ -754,12 +753,12 @@ mod test {
     }
     impl TestContext {
-        fn build(
+        fn build<A: AsRef<[u8]>>(
             &self,
             optional_words: bool,
             authorize_typos: bool,
             words_limit: Option<usize>,
-            query: TokenStream,
+            query: ClassifiedTokenIter<A>,
         ) -> Result<Option<(Operation, PrimitiveQuery)>> {
             let primitive_query = create_primitive_query(query, None, words_limit);
             if !primitive_query.is_empty() {
@@ -856,9 +855,7 @@ mod test {
     #[test]
     fn prefix() {
         let query = "hey friends";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -889,9 +886,7 @@ mod test {
     #[test]
     fn no_prefix() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -922,9 +917,7 @@ mod test {
     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -987,9 +980,7 @@ mod test {
     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -1087,9 +1078,7 @@ mod test {
     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -1120,9 +1109,7 @@ mod test {
     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -1159,9 +1146,7 @@ mod test {
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1177,9 +1162,7 @@ mod test {
     #[test]
     fn phrase_with_hard_separator() {
         let query = "\"hey friends. wooop wooop\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1195,9 +1178,7 @@ mod test {
     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             true,
@@ -1280,9 +1261,7 @@ mod test {
     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
         let (query_tree, _) =
@@ -1294,9 +1273,7 @@ mod test {
     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             true,
@@ -1365,9 +1342,7 @@ mod test {
     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::Or(
             false,
@@ -1397,9 +1372,7 @@ mod test {
     #[test]
     fn words_limit() {
         let query = "\"hey my\" good friend";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
@@ -1441,10 +1414,8 @@ mod test {
     #[test]
     fn disable_typo_on_word() {
         let query = "goodbye";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
         let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
         let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
         let context = TestContext { exact_words, ..Default::default() };

View File

@@ -3,8 +3,7 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
+use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
@@ -40,11 +39,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
     let mut key_buffer = Vec::new();
     let mut field_buffer = String::new();
-    let mut config = AnalyzerConfig::default();
+    let mut builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
-        config.stop_words(stop_words);
+        builder.stop_words(stop_words);
     }
-    let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
+    let tokenizer = builder.build();
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
@@ -64,12 +63,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
                     serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
                 field_buffer.clear();
                 if let Some(field) = json_to_string(&value, &mut field_buffer) {
-                    let analyzed = analyzer.analyze(field);
-                    let tokens = process_tokens(analyzed.tokens())
+                    let tokens = process_tokens(tokenizer.tokenize(field))
                         .take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
                     for (index, token) in tokens {
-                        let token = token.text().trim();
+                        let token = token.lemma().trim();
                         if !token.is_empty() {
                             key_buffer.truncate(mem::size_of::<u32>());
                             key_buffer.extend_from_slice(token.as_bytes());
@@ -146,7 +144,7 @@ fn process_tokens<'a>(
     tokens: impl Iterator<Item = Token<'a>>,
 ) -> impl Iterator<Item = (usize, Token<'a>)> {
     tokens
-        .skip_while(|token| token.is_separator().is_some())
+        .skip_while(|token| token.is_separator())
         .scan((0, None), |(offset, prev_kind), token| {
             match token.kind {
                 TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {

View File

@@ -1,8 +1,8 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
+use charabia::{Tokenizer, TokenizerBuilder};
 use itertools::Itertools;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;
@@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
-                    analyzer
-                        .analyze(text)
-                        .tokens()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                    tokenizer
+                        .tokenize(text)
                         .filter_map(|token| {
                             if token.is_word() {
-                                Some(token.text().to_string())
+                                Some(token.lemma().to_string())
                             } else {
                                 None
                             }
@@ -399,19 +398,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                         .collect::<Vec<_>>()
                 }
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
                 let mut new_synonyms = HashMap::new();
                 for (word, synonyms) in synonyms {
                     // Normalize both the word and associated synonyms.
-                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_word = normalize(&tokenizer, word);
                     let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
+                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
                     // Store the normalized synonyms under the normalized word,
                     // merging the possible duplicate words.
@@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
-                    analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
                 let mut words: Vec<_> =
-                    words.iter().map(|word| normalize(&analyzer, word)).collect();
+                    words.iter().map(|word| normalize(&tokenizer, word)).collect();
                 // normalization could reorder words
                 words.sort_unstable();