Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-26 23:04:26 +01:00)
Use Charabia in milli
This commit is contained in:
parent 192e024ada
commit 86ac8568e6
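
This commit switches milli from meilisearch_tokenizer to charabia throughout: imports, query parsing, matching, document extraction, and settings normalization. As orientation before the hunks, here is a minimal sketch (not part of the commit) of the call-site pattern the diff moves to, using only charabia items that appear below (the Tokenize trait, tokenize(), lemma()):

    // Sketch only: the old Analyzer::analyze(..).tokens() / token.text() flow
    // becomes the Tokenize trait on &str and the lemma() accessor.
    use charabia::Tokenize;

    fn lemmas(text: &str) -> Vec<String> {
        text.tokenize().map(|token| token.lemma().to_string()).collect()
    }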
@@ -21,7 +21,7 @@ pub use filter_parser::{Condition, FilterCondition};
 use fxhash::{FxHasher32, FxHasher64};
 pub use grenad::CompressionType;
 use serde_json::{Map, Value};
-pub use {heed, meilisearch_tokenizer as tokenizer};
+pub use {charabia as tokenizer, heed};
 
 pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError};
 pub use self::criterion::{default_criteria, Criterion, CriterionError};
@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
 use std::fmt;
 use std::ops::{Index, IndexMut};
 
+use charabia::Token;
 use levenshtein_automata::{Distance, DFA};
-use meilisearch_tokenizer::Token;
 
 use crate::search::build_dfa;
 
@@ -99,13 +99,13 @@ impl MatchingWord {
 
     /// Returns the lenght in chars of the match in case of the token matches the term.
     pub fn match_token(&self, token: &Token) -> Option<usize> {
-        match self.dfa.eval(token.text()) {
+        match self.dfa.eval(token.lemma()) {
             Distance::Exact(t) if t <= self.typo => {
                 if self.prefix {
-                    let len = bytes_to_highlight(token.text(), &self.word);
-                    Some(token.num_chars_from_bytes(len))
+                    let len = bytes_to_highlight(token.lemma(), &self.word);
+                    Some(token.original_lengths(len).0)
                 } else {
-                    Some(token.num_chars_from_bytes(token.text().len()))
+                    Some(token.original_lengths(token.lemma().len()).0)
                 }
             }
             _otherwise => None,
@@ -262,7 +262,7 @@ mod tests {
     use std::borrow::Cow;
     use std::str::from_utf8;
 
-    use meilisearch_tokenizer::TokenKind;
+    use charabia::TokenKind;
 
     use super::*;
     use crate::MatchingWords;
@@ -344,11 +344,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("word"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("word"),
+                    char_end: "word".chars().count(),
                     byte_end: "word".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 3, ids: &[2] })
@@ -357,11 +356,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("nyc"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("nyc"),
+                    char_end: "nyc".chars().count(),
                     byte_end: "nyc".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             None
@@ -370,11 +368,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("world"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("world"),
+                    char_end: "world".chars().count(),
                     byte_end: "world".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -383,11 +380,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("splitted"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("splitted"),
+                    char_end: "splitted".chars().count(),
                     byte_end: "splitted".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[0] })
@@ -396,11 +392,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("thisnew"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("thisnew"),
+                    char_end: "thisnew".chars().count(),
                     byte_end: "thisnew".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             None
@@ -409,11 +404,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("borld"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("borld"),
+                    char_end: "borld".chars().count(),
                     byte_end: "borld".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 5, ids: &[2] })
@@ -422,11 +416,10 @@ mod tests {
             matching_words
                 .match_token(&Token {
                     kind: TokenKind::Word,
-                    word: Cow::Borrowed("wordsplit"),
-                    byte_start: 0,
-                    char_index: 0,
+                    lemma: Cow::Borrowed("wordsplit"),
+                    char_end: "wordsplit".chars().count(),
                     byte_end: "wordsplit".len(),
-                    char_map: None,
+                    ..Default::default()
                 })
                 .next(),
             Some(MatchType::Full { char_len: 4, ids: &[2] })
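
In the test hunks above, the hand-built tokens move to charabia's Token struct: word becomes lemma, the explicit byte_start/char_index/char_map fields disappear, and anything not set explicitly is filled by ..Default::default(). A hedged sketch of such a test token, using only fields visible in this diff (kind, lemma, char_end, byte_end); the helper name is illustrative:

    // Sketch only: charabia's Token has more fields, left at their defaults here.
    use std::borrow::Cow;
    use charabia::{Token, TokenKind};

    fn word_token(word: &'static str) -> Token<'static> {
        Token {
            kind: TokenKind::Word,
            lemma: Cow::Borrowed(word),
            char_end: word.chars().count(),
            byte_end: word.len(),
            ..Default::default()
        }
    }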
@@ -1,8 +1,8 @@
 use std::borrow::Cow;
 
+use charabia::{SeparatorKind, Token};
 use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
 pub use matching_words::{MatchingWord, MatchingWords};
-use meilisearch_tokenizer::token::{SeparatorKind, Token};
 use serde::Serialize;
 
 pub mod matching_words;
@@ -168,13 +168,13 @@ impl<'t> Matcher<'t, '_> {
                 let current_token_position = *token_position;
                 let current_word_position = *word_position;
                 *token_position += 1;
-                if token.is_separator().is_none() {
+                if !token.is_separator() {
                     *word_position += 1;
                 }
 
                 Some((current_token_position, current_word_position, token))
             })
-            .filter(|(_, _, token)| token.is_separator().is_none());
+            .filter(|(_, _, token)| !token.is_separator());
 
         while let Some((token_position, word_position, word)) = words_positions.next() {
             for match_type in self.matching_words.match_token(word) {
@@ -243,8 +243,8 @@ impl<'t> Matcher<'t, '_> {
         let mut after_tokens = self.tokens[last_match_token_position..].iter().peekable();
 
         while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.is_separator());
-            let after_token = after_tokens.peek().map(|t| t.is_separator());
+            let before_token = before_tokens.peek().map(|t| t.separator_kind());
+            let after_token = after_tokens.peek().map(|t| t.separator_kind());
 
             match (before_token, after_token) {
                 // we can expand both sides.
@@ -470,7 +470,7 @@ impl<'t> Matcher<'t, '_> {
 
 #[cfg(test)]
 mod tests {
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
+    use charabia::Tokenize;
 
     use super::*;
     use crate::search::matches::matching_words::MatchingWord;
@@ -490,30 +490,26 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: false, crop: None };
 
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
 
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
 
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop and no highlight should return complete text.
         assert_eq!(&matcher.format(format_options), &text);
@@ -524,44 +520,38 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: true, crop: None };
 
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
 
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
 
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text, because there is no matches.
         assert_eq!(&matcher.format(format_options), &text);
 
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
 
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(
@@ -580,30 +570,26 @@ mod tests {
         let matching_words = MatchingWords::new(matching_words);
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: true, crop: None };
 
         // Text containing prefix match.
         let text = "Ŵôřlḑôle";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>ôle");
 
         // Text containing unicode match.
         let text = "Ŵôřlḑ";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Ŵôřlḑ</em>");
 
         // Text containing unicode match.
         let text = "Westfália";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no crop should return complete text with highlighted matches.
         assert_eq!(&matcher.format(format_options), "<em>Westfáli</em>a");
@@ -614,28 +600,24 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: false, crop: Some(10) };
 
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
 
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
 
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -645,8 +627,7 @@ mod tests {
 
         // Text without any match starting by a separator.
         let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 first words with a marker at the end.
         assert_eq!(
@@ -656,19 +637,17 @@ mod tests {
 
         // Test phrase propagation
         let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // should crop the phrase instead of croping around the match.
         assert_eq!(
             &matcher.format(format_options),
-            "…Split The World is a book written by Emily Henry…"
+            "… Split The World is a book written by Emily Henry…",
         );
 
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -678,8 +657,7 @@ mod tests {
 
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // no highlight should return 10 last words with a marker at the start.
         assert_eq!(
@@ -689,8 +667,7 @@ mod tests {
 
         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -700,8 +677,7 @@ mod tests {
 
         // Text containing matches with diferent density.
         let text = "split void the void void world void void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -711,8 +687,7 @@ mod tests {
 
         // Text containing matches with same word.
         let text = "split split split split split split void void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -726,28 +701,24 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
 
         // empty text.
         let text = "";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), "");
 
         // text containing only separators.
         let text = ":-)";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(&matcher.format(format_options), ":-)");
 
         // Text without any match.
         let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 first words with a marker at the end.
         assert_eq!(
@@ -757,8 +728,7 @@ mod tests {
 
         // Text containing some matches.
         let text = "Natalie risk her future to build a world with the boy she loves.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 last words with a marker at the start and highlighted matches.
         assert_eq!(
@@ -768,16 +738,14 @@ mod tests {
 
         // Text containing all matches.
         let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // both should return 10 last words with a marker at the start and highlighted matches.
         assert_eq!(&matcher.format(format_options), "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>.");
 
         // Text containing a match unordered and a match ordered.
         let text = "The world split void void void void void void void void void split the world void void";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
         let mut matcher = builder.build(&tokens[..], text);
         // crop should return 10 last words with a marker at the start.
         assert_eq!(
@@ -792,11 +760,9 @@ mod tests {
         let matching_words = matching_words();
 
         let builder = MatcherBuilder::from_matching_words(matching_words);
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let text = "void void split the world void void.";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
 
         // set a smaller crop size
         let format_options = FormatOptions { highlight: false, crop: Some(2) };
@@ -847,13 +813,11 @@ mod tests {
         let mut builder = MatcherBuilder::from_matching_words(matching_words);
         builder.highlight_prefix("_".to_string());
         builder.highlight_suffix("_".to_string());
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
 
         let format_options = FormatOptions { highlight: true, crop: None };
 
         let text = "the do or die can't be he do and or isn't he";
-        let analyzed = analyzer.analyze(&text);
-        let tokens: Vec<_> = analyzed.tokens().collect();
+        let tokens: Vec<_> = text.tokenize().collect();
 
         let mut matcher = builder.build(&tokens[..], text);
         assert_eq!(
@@ -6,12 +6,12 @@ use std::result::Result as StdResult;
 use std::str::Utf8Error;
 use std::time::Instant;
 
+use charabia::TokenizerBuilder;
 use distinct::{Distinct, DocIter, FacetDistinct, NoopDistinct};
 use fst::automaton::Str;
 use fst::{Automaton, IntoStreamer, Streamer};
 use levenshtein_automata::{LevenshteinAutomatonBuilder as LevBuilder, DFA};
 use log::debug;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
@@ -126,14 +126,14 @@ impl<'a> Search<'a> {
                 builder.words_limit(self.words_limit);
                 // We make sure that the analyzer is aware of the stop words
                 // this ensures that the query builder is able to properly remove them.
-                let mut config = AnalyzerConfig::default();
+                let mut tokbuilder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.rtxn)?;
                 if let Some(ref stop_words) = stop_words {
-                    config.stop_words(stop_words);
+                    tokbuilder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
-                let result = analyzer.analyze(query);
-                let tokens = result.tokens();
+
+                let tokenizer = tokbuilder.build();
+                let tokens = tokenizer.tokenize(query);
                 builder
                     .build(tokens)?
                     .map_or((None, None, None), |(qt, pq, mw)| (Some(qt), Some(pq), Some(mw)))
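
The hunk above swaps AnalyzerConfig/Analyzer for charabia's TokenizerBuilder so the query tokenizer still knows about the index stop words. A standalone sketch of the same pattern; the stop-word list and function name are illustrative, not taken from the commit:

    // Sketch only: build a stop-word-aware tokenizer the way the search path now does.
    use charabia::TokenizerBuilder;

    fn print_query_words(query: &str) {
        // fst sets must be built from keys in lexicographic order.
        let stop_words = fst::Set::from_iter(["of", "the"]).unwrap();
        let mut builder = TokenizerBuilder::new();
        builder.stop_words(&stop_words);
        let tokenizer = builder.build();
        for token in tokenizer.tokenize(query) {
            if token.is_word() {
                println!("{}", token.lemma());
            }
        }
    }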
@@ -1,10 +1,9 @@
 use std::borrow::Cow;
 use std::{cmp, fmt, mem};
 
+use charabia::classifier::ClassifiedTokenIter;
+use charabia::{SeparatorKind, TokenKind};
 use fst::Set;
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::tokenizer::TokenStream;
-use meilisearch_tokenizer::TokenKind;
 use roaring::RoaringBitmap;
 use slice_group_by::GroupBy;
 
@@ -235,9 +234,9 @@ impl<'a> QueryTreeBuilder<'a> {
     /// - if `authorize_typos` is set to `false` the query tree will be generated
     /// forcing all query words to match documents without any typo
     /// (the criterion `typo` will be ignored)
-    pub fn build(
+    pub fn build<A: AsRef<[u8]>>(
         &self,
-        query: TokenStream,
+        query: ClassifiedTokenIter<A>,
     ) -> Result<Option<(Operation, PrimitiveQuery, MatchingWords)>> {
         let stop_words = self.index.stop_words(self.rtxn)?;
         let primitive_query = create_primitive_query(query, stop_words, self.words_limit);
@@ -649,11 +648,14 @@ impl PrimitiveQueryPart {
 
 /// Create primitive query from tokenized query string,
 /// the primitive query is an intermediate state to build the query tree.
-fn create_primitive_query(
-    query: TokenStream,
+fn create_primitive_query<A>(
+    query: ClassifiedTokenIter<A>,
     stop_words: Option<Set<&[u8]>>,
     words_limit: Option<usize>,
-) -> PrimitiveQuery {
+) -> PrimitiveQuery
+where
+    A: AsRef<[u8]>,
+{
     let mut primitive_query = Vec::new();
     let mut phrase = Vec::new();
     let mut quoted = false;
@@ -673,21 +675,18 @@ fn create_primitive_query(
                 // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
                 // 3. if the word is the last token of the query we push it as a prefix word.
                 if quoted {
-                    phrase.push(token.word.to_string());
+                    phrase.push(token.lemma().to_string());
                 } else if peekable.peek().is_some() {
-                    if !stop_words
-                        .as_ref()
-                        .map_or(false, |swords| swords.contains(token.word.as_ref()))
-                    {
+                    if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.lemma())) {
                         primitive_query
-                            .push(PrimitiveQueryPart::Word(token.word.to_string(), false));
+                            .push(PrimitiveQueryPart::Word(token.lemma().to_string(), false));
                     }
                 } else {
-                    primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
+                    primitive_query.push(PrimitiveQueryPart::Word(token.lemma().to_string(), true));
                 }
             }
             TokenKind::Separator(separator_kind) => {
-                let quote_count = token.word.chars().filter(|&s| s == '"').count();
+                let quote_count = token.lemma().chars().filter(|&s| s == '"').count();
                 // swap quoted state if we encounter a double quote
                 if quote_count % 2 != 0 {
                     quoted = !quoted;
@@ -738,8 +737,8 @@ pub fn maximum_proximity(operation: &Operation) -> usize {
 mod test {
     use std::collections::HashMap;
 
+    use charabia::Tokenize;
     use maplit::hashmap;
-    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
     use rand::rngs::StdRng;
     use rand::{Rng, SeedableRng};
 
@@ -754,12 +753,12 @@ mod test {
     }
 
     impl TestContext {
-        fn build(
+        fn build<A: AsRef<[u8]>>(
             &self,
             optional_words: bool,
             authorize_typos: bool,
             words_limit: Option<usize>,
-            query: TokenStream,
+            query: ClassifiedTokenIter<A>,
         ) -> Result<Option<(Operation, PrimitiveQuery)>> {
             let primitive_query = create_primitive_query(query, None, words_limit);
             if !primitive_query.is_empty() {
@@ -856,9 +855,7 @@ mod test {
     #[test]
     fn prefix() {
         let query = "hey friends";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -889,9 +886,7 @@ mod test {
     #[test]
     fn no_prefix() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -922,9 +917,7 @@ mod test {
     #[test]
     fn synonyms() {
         let query = "hello world ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -987,9 +980,7 @@ mod test {
     #[test]
     fn complex_synonyms() {
         let query = "new york city ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1087,9 +1078,7 @@ mod test {
     #[test]
     fn ngrams() {
         let query = "n grams ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1120,9 +1109,7 @@ mod test {
     #[test]
     fn word_split() {
         let query = "wordsplit fish ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1159,9 +1146,7 @@ mod test {
     #[test]
     fn phrase() {
         let query = "\"hey friends\" \" \" \"wooop";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1177,9 +1162,7 @@ mod test {
     #[test]
     fn phrase_with_hard_separator() {
         let query = "\"hey friends. wooop wooop\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "friends".to_string()]),
@@ -1195,9 +1178,7 @@ mod test {
     #[test]
     fn optional_word() {
         let query = "hey my friend ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             true,
@@ -1280,9 +1261,7 @@ mod test {
     #[test]
     fn optional_word_phrase() {
         let query = "\"hey my\"";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Phrase(vec!["hey".to_string(), "my".to_string()]);
         let (query_tree, _) =
@@ -1294,9 +1273,7 @@ mod test {
     #[test]
     fn optional_word_multiple_phrases() {
         let query = r#""hey" my good "friend""#;
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             true,
@@ -1365,9 +1342,7 @@ mod test {
     #[test]
     fn no_typo() {
         let query = "hey friends ";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::Or(
             false,
@@ -1397,9 +1372,7 @@ mod test {
     #[test]
     fn words_limit() {
         let query = "\"hey my\" good friend";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
-        let tokens = result.tokens();
+        let tokens = query.tokenize();
 
         let expected = Operation::And(vec![
             Operation::Phrase(vec!["hey".to_string(), "my".to_string()]),
@@ -1441,10 +1414,8 @@ mod test {
     #[test]
     fn disable_typo_on_word() {
         let query = "goodbye";
-        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
-        let result = analyzer.analyze(query);
+        let tokens = query.tokenize();
 
-        let tokens = result.tokens();
         let exact_words = fst::Set::from_iter(Some("goodbye")).unwrap().into_fst().into_inner();
         let exact_words = Some(fst::Set::new(exact_words).unwrap().map_data(Cow::Owned).unwrap());
         let context = TestContext { exact_words, ..Default::default() };
@@ -3,8 +3,7 @@ use std::convert::TryInto;
 use std::fs::File;
 use std::{io, mem, str};
 
-use meilisearch_tokenizer::token::SeparatorKind;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig, Token, TokenKind};
+use charabia::{SeparatorKind, Token, TokenKind, TokenizerBuilder};
 use roaring::RoaringBitmap;
 use serde_json::Value;
 
@@ -40,11 +39,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
 
     let mut key_buffer = Vec::new();
     let mut field_buffer = String::new();
-    let mut config = AnalyzerConfig::default();
+    let mut builder = TokenizerBuilder::new();
     if let Some(stop_words) = stop_words {
-        config.stop_words(stop_words);
+        builder.stop_words(stop_words);
     }
-    let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
+    let tokenizer = builder.build();
 
     let mut cursor = obkv_documents.into_cursor()?;
     while let Some((key, value)) = cursor.move_on_next()? {
||||||
@ -64,12 +63,11 @@ pub fn extract_docid_word_positions<R: io::Read + io::Seek>(
|
|||||||
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
serde_json::from_slice(field_bytes).map_err(InternalError::SerdeJson)?;
|
||||||
field_buffer.clear();
|
field_buffer.clear();
|
||||||
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
if let Some(field) = json_to_string(&value, &mut field_buffer) {
|
||||||
let analyzed = analyzer.analyze(field);
|
let tokens = process_tokens(tokenizer.tokenize(field))
|
||||||
let tokens = process_tokens(analyzed.tokens())
|
|
||||||
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
.take_while(|(p, _)| (*p as u32) < max_positions_per_attributes);
|
||||||
|
|
||||||
for (index, token) in tokens {
|
for (index, token) in tokens {
|
||||||
let token = token.text().trim();
|
let token = token.lemma().trim();
|
||||||
if !token.is_empty() {
|
if !token.is_empty() {
|
||||||
key_buffer.truncate(mem::size_of::<u32>());
|
key_buffer.truncate(mem::size_of::<u32>());
|
||||||
key_buffer.extend_from_slice(token.as_bytes());
|
key_buffer.extend_from_slice(token.as_bytes());
|
||||||
@ -146,7 +144,7 @@ fn process_tokens<'a>(
|
|||||||
tokens: impl Iterator<Item = Token<'a>>,
|
tokens: impl Iterator<Item = Token<'a>>,
|
||||||
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
) -> impl Iterator<Item = (usize, Token<'a>)> {
|
||||||
tokens
|
tokens
|
||||||
.skip_while(|token| token.is_separator().is_some())
|
.skip_while(|token| token.is_separator())
|
||||||
.scan((0, None), |(offset, prev_kind), token| {
|
.scan((0, None), |(offset, prev_kind), token| {
|
||||||
match token.kind {
|
match token.kind {
|
||||||
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
TokenKind::Word | TokenKind::StopWord | TokenKind::Unknown => {
|
||||||
|
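
Across these hunks the separator API changes shape: is_separator() now returns a plain bool (the old version returned an Option of the separator kind), and the kind, when it is still needed, comes from separator_kind() as in the Matcher hunk earlier. A small sketch of filtering a token stream down to words under that assumption; the helper name is illustrative:

    // Sketch only: keep word tokens and drop separators, as process_tokens now does.
    use charabia::Tokenize;

    fn words(text: &str) -> Vec<String> {
        text.tokenize()
            .filter(|token| !token.is_separator())
            .map(|token| token.lemma().to_string())
            .collect()
    }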
@@ -1,8 +1,8 @@
 use std::collections::{BTreeSet, HashMap, HashSet};
 use std::result::Result as StdResult;
 
+use charabia::{Tokenizer, TokenizerBuilder};
 use itertools::Itertools;
-use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 use time::OffsetDateTime;
 
@@ -385,13 +385,12 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_synonyms(&mut self) -> Result<bool> {
         match self.synonyms {
             Setting::Set(ref synonyms) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> Vec<String> {
-                    analyzer
-                        .analyze(text)
-                        .tokens()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> Vec<String> {
+                    tokenizer
+                        .tokenize(text)
                         .filter_map(|token| {
                             if token.is_word() {
-                                Some(token.text().to_string())
+                                Some(token.lemma().to_string())
                             } else {
                                 None
                             }
@@ -399,19 +398,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                         .collect::<Vec<_>>()
                 }
 
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
 
                 let mut new_synonyms = HashMap::new();
                 for (word, synonyms) in synonyms {
                     // Normalize both the word and associated synonyms.
-                    let normalized_word = normalize(&analyzer, word);
+                    let normalized_word = normalize(&tokenizer, word);
                     let normalized_synonyms =
-                        synonyms.iter().map(|synonym| normalize(&analyzer, synonym));
+                        synonyms.iter().map(|synonym| normalize(&tokenizer, synonym));
 
                     // Store the normalized synonyms under the normalized word,
                     // merging the possible duplicate words.
@@ -584,19 +583,19 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
     fn update_exact_words(&mut self) -> Result<()> {
         match self.exact_words {
             Setting::Set(ref mut words) => {
-                fn normalize(analyzer: &Analyzer<&[u8]>, text: &str) -> String {
-                    analyzer.analyze(text).tokens().map(|token| token.text().to_string()).collect()
+                fn normalize(tokenizer: &Tokenizer<&[u8]>, text: &str) -> String {
+                    tokenizer.tokenize(text).map(|token| token.lemma().to_string()).collect()
                 }
 
-                let mut config = AnalyzerConfig::default();
+                let mut builder = TokenizerBuilder::new();
                 let stop_words = self.index.stop_words(self.wtxn)?;
-                if let Some(stop_words) = &stop_words {
-                    config.stop_words(stop_words);
+                if let Some(ref stop_words) = stop_words {
+                    builder.stop_words(stop_words);
                 }
-                let analyzer = Analyzer::new(config);
+                let tokenizer = builder.build();
 
                 let mut words: Vec<_> =
-                    words.iter().map(|word| normalize(&analyzer, word)).collect();
+                    words.iter().map(|word| normalize(&tokenizer, word)).collect();
 
                 // normalization could reorder words
                 words.sort_unstable();