MeiliSearch/milli/src/search/matches/mod.rs

791 lines
33 KiB
Rust
Raw Normal View History

2022-03-22 15:22:14 +01:00
use std::borrow::Cow;
2022-03-29 14:57:21 +02:00
pub use matching_words::MatchingWords;
2022-04-04 18:56:59 +02:00
use matching_words::{MatchType, PrimitiveWordId};
2022-03-30 15:22:18 +02:00
use meilisearch_tokenizer::token::{SeparatorKind, Token};
2022-03-22 15:22:14 +01:00
2022-04-04 18:56:59 +02:00
pub mod matching_words;
2022-03-22 15:22:14 +01:00
/// Number of words kept around the best match interval when no crop size is configured.
const DEFAULT_CROP_SIZE: usize = 10;
/// Marker inserted where text was cut off when no crop marker is configured.
/// The tests of this module expect `…` from a default builder.
const DEFAULT_CROP_MARKER: &str = "…";
/// Text inserted before a highlighted match when no prefix is configured.
const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
/// Text inserted after a highlighted match when no suffix is configured.
const DEFAULT_HIGHLIGHT_SUFFIX: &str = "</em>";
/// Holds the query's matching words together with the formatting options
/// (crop size, crop marker, highlight prefix/suffix) used to build a [`Matcher`].
pub struct MatcherBuilder {
// Words of the query that tokens are matched against.
matching_words: MatchingWords,
// Crop size, expressed in words; initialised to `DEFAULT_CROP_SIZE`.
crop_size: usize,
// Custom crop marker; `None` means the default marker is used at build time.
crop_marker: Option<String>,
// Custom highlight prefix; `None` means the default prefix is used at build time.
highlight_prefix: Option<String>,
// Custom highlight suffix; `None` means the default suffix is used at build time.
highlight_suffix: Option<String>,
}
impl MatcherBuilder {
2022-03-30 10:50:23 +02:00
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
Self {
matching_words,
crop_size: DEFAULT_CROP_SIZE,
crop_marker: None,
highlight_prefix: None,
highlight_suffix: None,
}
}
2022-03-22 15:22:14 +01:00
pub fn crop_size(&mut self, word_count: usize) -> &Self {
self.crop_size = word_count;
self
}
pub fn crop_marker(&mut self, marker: String) -> &Self {
self.crop_marker = Some(marker);
self
}
pub fn highlight_prefix(&mut self, prefix: String) -> &Self {
self.highlight_prefix = Some(prefix);
self
}
pub fn highlight_suffix(&mut self, suffix: String) -> &Self {
self.highlight_suffix = Some(suffix);
self
}
pub fn build<'t, 'm>(&'m self, tokens: &'t [Token], text: &'t str) -> Matcher<'t, 'm> {
let crop_marker = match &self.crop_marker {
Some(marker) => marker.as_str(),
None => &DEFAULT_CROP_MARKER,
};
let highlight_prefix = match &self.highlight_prefix {
Some(marker) => marker.as_str(),
None => &DEFAULT_HIGHLIGHT_PREFIX,
};
let highlight_suffix = match &self.highlight_suffix {
Some(marker) => marker.as_str(),
None => &DEFAULT_HIGHLIGHT_SUFFIX,
};
Matcher {
text,
tokens,
matching_words: &self.matching_words,
crop_size: self.crop_size,
crop_marker,
highlight_prefix,
highlight_suffix,
matches: None,
}
}
}
2022-03-28 18:17:50 +02:00
#[derive(Clone, Debug)]
pub struct Match {
2022-03-22 15:22:14 +01:00
match_len: usize,
2022-04-04 18:56:59 +02:00
// ids of the query words that matches.
ids: Vec<PrimitiveWordId>,
2022-03-22 15:22:14 +01:00
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
token_position: usize,
2022-03-22 15:22:14 +01:00
}
2022-03-28 18:17:50 +02:00
/// Location of a match inside the original text, as returned by `Matcher::matches`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct MatchBounds {
    /// Byte offset at which the matched token starts (the token's `byte_start`).
    pub start: usize,
    /// Length of the match, copied from the match's `match_len`.
    // NOTE(review): `match_len` comes from `char_len`, so this is presumably a
    // character count while `start` is a byte offset; confirm with callers.
    pub length: usize,
}
/// Finds the query matches inside one tokenized text and formats that text
/// (highlighting and/or cropping). Built by `MatcherBuilder::build`.
pub struct Matcher<'t, 'm> {
    /// Original text being formatted.
    text: &'t str,
    /// Tokens of `text`, separators included.
    tokens: &'t [Token<'t>],
    /// Query words the tokens are matched against.
    matching_words: &'m MatchingWords,
    /// Crop size, expressed in words; `0` disables cropping in `format`.
    crop_size: usize,
    /// Marker pushed where the text has been cut; skipped when empty.
    crop_marker: &'m str,
    /// Text pushed before each highlighted match.
    highlight_prefix: &'m str,
    /// Text pushed after each highlighted match.
    highlight_suffix: &'m str,
    /// Lazily computed matches; `None` until `compute_matches` has run.
    matches: Option<Vec<Match>>,
}
impl<'t> Matcher<'t, '_> {
fn compute_matches(&mut self) -> &mut Self {
let mut matches = Vec::new();
let mut word_position = 0;
let mut token_position = 0;
2022-04-04 18:56:59 +02:00
while let Some(token) = self.tokens.get(token_position) {
if token.is_separator().is_none() {
2022-04-04 18:56:59 +02:00
'matches: for match_type in self.matching_words.match_token(&token) {
match match_type {
MatchType::Full { char_len, ids } => {
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
word_position,
token_position,
});
// stop on the first match
break;
}
MatchType::Partial(mut partial) => {
let mut potential_matches =
vec![(token_position, word_position, partial.char_len())];
let mut t_position = 1;
let mut w_position = 1;
'partials: for token in &self.tokens[token_position + 1..] {
if token.is_separator().is_none() {
partial = match partial.match_token(&token) {
Some(MatchType::Partial(partial)) => {
potential_matches.push((
token_position + t_position,
word_position + w_position,
partial.char_len(),
));
partial
}
// partial match is now full, we keep this matches and we advance positions
Some(MatchType::Full { char_len, ids }) => {
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| {
Match {
match_len,
ids: ids.to_vec(),
word_position,
token_position,
}
},
);
matches.extend(iter);
word_position += w_position;
token_position += t_position;
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
word_position,
token_position,
});
break 'matches;
}
// no match, continue to next match.
None => break 'partials,
};
w_position += 1;
}
t_position += 1;
}
}
}
2022-03-22 15:22:14 +01:00
}
word_position += 1;
2022-03-22 15:22:14 +01:00
}
token_position += 1;
2022-03-22 15:22:14 +01:00
}
self.matches = Some(matches);
self
}
/// Returns the bounds of every match found in the text, computing the matches
/// first when that has not been done yet.
///
/// Each bound starts at the matched token's `byte_start` and carries the match's
/// `match_len` as its length.
pub fn matches(&mut self) -> Vec<MatchBounds> {
    if self.matches.is_none() {
        self.compute_matches();
    }

    let tokens = self.tokens;
    let mut bounds = Vec::new();
    for found in self.matches.iter().flatten() {
        bounds.push(MatchBounds {
            start: tokens[found.token_position].byte_start,
            length: found.match_len,
        });
    }
    bounds
}
2022-03-28 18:17:50 +02:00
/// Returns the positions of the first and last tokens of the crop window built
/// around `matches` (expected to be the selected match interval).
///
/// The window starts on the interval itself and is grown token by token, on both
/// sides, until `crop_size` words are covered or the text is exhausted.
/// With no match at all the window starts at token 0.
fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
    let (first_match_word_position, first_match_token_position) =
        matches.first().map_or((0, 0), |m| (m.word_position, m.token_position));
    let (last_match_word_position, last_match_token_position) =
        matches.last().map_or((0, 0), |m| (m.word_position, m.token_position));

    // TODO: buggy if no match and first token is a separator
    // Words still to add around the interval. Saturating arithmetic keeps an
    // interval wider than `crop_size` from underflowing the `usize` budget.
    let mut remaining_words =
        (self.crop_size + first_match_word_position).saturating_sub(last_match_word_position);

    // if first token is a word, then remove 1 to remaining_words.
    let starts_on_word = self
        .tokens
        .get(first_match_token_position)
        .map_or(false, |t| t.is_separator().is_none());
    if starts_on_word {
        remaining_words = remaining_words.saturating_sub(1);
    }

    let mut first_token_position = first_match_token_position;
    let mut last_token_position = last_match_token_position;

    while remaining_words > 0 {
        let before = first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i));
        let after = last_token_position.checked_add(1).and_then(|i| self.tokens.get(i));

        match (before, after) {
            (Some(ft), Some(lt)) => match (ft.is_separator(), lt.is_separator()) {
                // both are separators: same kind advances both sides, otherwise
                // avoid crossing a hard separator on the left when possible.
                (Some(f_kind), Some(s_kind)) => {
                    if f_kind == s_kind {
                        first_token_position -= 1;
                        last_token_position += 1;
                    } else if f_kind == SeparatorKind::Hard {
                        last_token_position += 1;
                    } else {
                        first_token_position -= 1;
                    }
                }
                // left is a word, advance left
                (None, Some(_)) => {
                    first_token_position -= 1;
                    remaining_words -= 1;
                }
                // right is a word, advance right
                (Some(_), None) => {
                    last_token_position += 1;
                    remaining_words -= 1;
                }
                // both are words, advance left then right if remaining_words > 0
                (None, None) => {
                    first_token_position -= 1;
                    remaining_words -= 1;

                    if remaining_words > 0 {
                        last_token_position += 1;
                        remaining_words -= 1;
                    }
                }
            },
            // the end of the text is reached, advance left.
            (Some(ft), None) => {
                first_token_position -= 1;
                if ft.is_separator().is_none() {
                    remaining_words -= 1;
                }
            }
            // the start of the text is reached, advance right.
            (None, Some(lt)) => {
                last_token_position += 1;
                if lt.is_separator().is_none() {
                    remaining_words -= 1;
                }
            }
            // no more token to add.
            (None, None) => break,
        }
    }

    (first_token_position, last_token_position)
}
2022-03-28 18:17:50 +02:00
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
2022-04-04 18:56:59 +02:00
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
2022-03-28 18:17:50 +02:00
let mut order_score = 0;
let mut distance_score = 0;
let mut iter = matches.iter().peekable();
while let Some(m) = iter.next() {
if let Some(next_match) = iter.peek() {
// if matches are ordered
2022-04-04 18:56:59 +02:00
if next_match.ids.iter().min() > m.ids.iter().min() {
2022-03-28 18:17:50 +02:00
order_score += 1;
}
// compute distance between matches
distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
}
2022-04-04 18:56:59 +02:00
ids.extend(m.ids.iter());
2022-03-28 18:17:50 +02:00
}
ids.sort_unstable();
ids.dedup();
let uniq_score = ids.len() as i16;
// rank by unique match count, then by distance between matches, then by ordered match count.
(uniq_score, distance_score, order_score)
}
/// Returns the sub-slice of `matches` with the best `match_interval_score` among the
/// intervals spanning fewer than `crop_size` words.
///
/// The candidate interval is slid over the matches: whenever adding the next match
/// would make it span `crop_size` words or more, the current interval is scored and
/// its start is moved forward until the next match fits. On equal scores the earliest
/// interval wins. Slices of zero or one match are returned unchanged.
fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
    if matches.len() <= 1 {
        return matches;
    }

    let mut best_interval = (0, 0);
    let mut best_interval_score = self.match_interval_score(&matches[0..=0]);

    let mut interval_first = 0;
    let mut interval_last = 0;
    for (index, next_match) in matches.iter().enumerate().skip(1) {
        // if next match would make the interval grow beyond crop_size
        if next_match.word_position - matches[interval_first].word_position >= self.crop_size {
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);

            // keep interval if it's the best
            if interval_score > best_interval_score {
                best_interval = (interval_first, interval_last);
                best_interval_score = interval_score;
            }

            // advance start of the interval while interval is longer than crop_size
            while next_match.word_position - matches[interval_first].word_position
                >= self.crop_size
            {
                interval_first += 1;
            }
        }
        interval_last = index;
    }

    // score the interval that was still open when the matches ran out.
    let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]);
    if interval_score > best_interval_score {
        best_interval = (interval_first, interval_last);
    }

    &matches[best_interval.0..=best_interval.1]
}
2022-03-28 18:17:50 +02:00
/// Returns the `(byte_start, byte_end)` range of the text to keep when cropping.
///
/// The best match interval is selected, widened to a token window, and the window is
/// converted to byte offsets. A missing first token yields `0`; a missing last token
/// yields `byte_start`, i.e. an empty range.
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
    let match_interval = self.find_best_match_interval(matches);
    let (first_token_position, last_token_position) = self.token_crop_bounds(match_interval);

    let byte_start = self.tokens.get(first_token_position).map_or(0, |t| t.byte_start);
    let byte_end = self.tokens.get(last_token_position).map_or(byte_start, |t| t.byte_end);

    (byte_start, byte_end)
}
2022-03-22 15:22:14 +01:00
/// Formats the text according to the requested options.
///
/// - `highlight`: wraps the matching part of each matched token between the
///   highlight prefix and suffix.
/// - `crop`: keeps only the window computed by `crop_bounds` and pushes the crop
///   marker on each side that was cut (unless the marker is empty).
///
/// Returns the borrowed original text when neither option applies, and a borrowed
/// slice when the result is a single slice of the text; otherwise the pieces are
/// concatenated into an owned string. Matches are computed on first use.
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
    // If 0 it will be considered null and thus not crop the field
    // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
    let crop = crop && self.crop_size > 0;

    // compute matches is not needed if no highlight or crop is requested.
    if !highlight && !crop {
        return Cow::Borrowed(self.text);
    }

    if self.matches.is_none() {
        self.compute_matches();
    }
    let matches = self.matches.as_deref().unwrap_or(&[]);

    let (byte_start, byte_end) =
        if crop { self.crop_bounds(matches) } else { (0, self.text.len()) };

    let mut formatted = Vec::new();

    // push crop marker if it's not the start of the text.
    if byte_start > 0 && !self.crop_marker.is_empty() {
        formatted.push(self.crop_marker);
    }

    let mut byte_index = byte_start;

    if highlight {
        // insert highlight markers around the matches located inside the crop window.
        let tokens = self.tokens;
        let visible_matches = matches
            .iter()
            .skip_while(|m| tokens[m.token_position].byte_start < byte_start)
            .take_while(|m| tokens[m.token_position].byte_start < byte_end);

        for m in visible_matches {
            let token = &tokens[m.token_position];

            // text between the previous match (or crop start) and this token.
            if byte_index < token.byte_start {
                formatted.push(&self.text[byte_index..token.byte_start]);
            }

            // byte offset of the `match_len`-th character of the token, clamped to
            // the token's end so the highlight never leaks past the token.
            let highlight_byte_index = self.text[token.byte_start..]
                .char_indices()
                .nth(m.match_len)
                .map_or(token.byte_end, |(offset, _)| token.byte_start + offset)
                .min(token.byte_end);

            formatted.push(self.highlight_prefix);
            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
            formatted.push(self.highlight_suffix);
            // unmatched tail of the token (empty when the whole token matched).
            formatted.push(&self.text[highlight_byte_index..token.byte_end]);

            byte_index = token.byte_end;
        }
    }

    // push the rest of the text between last match and the end of crop.
    if byte_index < byte_end {
        formatted.push(&self.text[byte_index..byte_end]);
    }

    // push crop marker if it's not the end of the text.
    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
        formatted.push(self.crop_marker);
    }

    if formatted.len() == 1 {
        // avoid concatenating if there is already 1 slice.
        Cow::Borrowed(&self.text[byte_start..byte_end])
    } else {
        Cow::Owned(formatted.concat())
    }
}
}
#[cfg(test)]
mod tests {
    use meilisearch_tokenizer::{Analyzer, AnalyzerConfig};

    use super::*;
    use crate::search::matches::matching_words::MatchingWord;

    /// Text without any match.
    const TEXT_NO_MATCH: &str = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
    /// Text containing all matches.
    const TEXT_ALL_MATCHES: &str = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
    /// Text containing some matches.
    const TEXT_SOME_MATCHES: &str =
        "Natalie risk her future to build a world with the boy she loves.";

    /// Matching words shared by most tests: "split", "the" and the prefix "world".
    fn matching_words() -> MatchingWords {
        let matching_words = vec![
            (vec![MatchingWord::new("split".to_string(), 0, false)], vec![0]),
            (vec![MatchingWord::new("the".to_string(), 0, false)], vec![1]),
            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![2]),
        ];

        MatchingWords::new(matching_words)
    }

    /// Tokenizes `text` with the default analyzer, builds a matcher from `builder`
    /// and returns the formatted text.
    fn format_with(builder: &MatcherBuilder, text: &str, highlight: bool, crop: bool) -> String {
        let analyzer = Analyzer::new(AnalyzerConfig::<Vec<u8>>::default());
        let analyzed = analyzer.analyze(text);
        let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
        matcher.format(highlight, crop).into_owned()
    }

    #[test]
    fn format_identity() {
        let builder = MatcherBuilder::from_matching_words(matching_words());
        let format = |text: &str| format_with(&builder, text, false, false);

        // no crop and no highlight should return complete text, whatever it contains.
        for text in [TEXT_NO_MATCH, TEXT_ALL_MATCHES, TEXT_SOME_MATCHES] {
            assert_eq!(format(text), text);
        }
    }

    #[test]
    fn format_highlight() {
        let builder = MatcherBuilder::from_matching_words(matching_words());
        let format = |text: &str| format_with(&builder, text, true, false);

        // empty text.
        assert_eq!(format(""), "");

        // text containing only separators.
        assert_eq!(format(":-)"), ":-)");

        // no crop should return complete text, because there is no matches.
        assert_eq!(format(TEXT_NO_MATCH), TEXT_NO_MATCH);

        // no crop should return complete text with highlighted matches.
        assert_eq!(
            format(TEXT_ALL_MATCHES),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // no crop should return complete text with highlighted matches.
        assert_eq!(
            format(TEXT_SOME_MATCHES),
            "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."
        );
    }

    #[test]
    fn highlight_unicode() {
        let matching_words = vec![
            (vec![MatchingWord::new("wessfali".to_string(), 1, true)], vec![0]),
            (vec![MatchingWord::new("world".to_string(), 1, true)], vec![1]),
        ];
        let builder = MatcherBuilder::from_matching_words(MatchingWords::new(matching_words));
        let format = |text: &str| format_with(&builder, text, true, false);

        // Text containing prefix match.
        assert_eq!(format("Ŵôřlḑôle"), "<em>Ŵôřlḑ</em>ôle");

        // Text containing unicode match.
        assert_eq!(format("Ŵôřlḑ"), "<em>Ŵôřlḑ</em>");

        // Text containing unicode match.
        assert_eq!(format("Westfália"), "<em>Westfáli</em>a");
    }

    #[test]
    fn format_crop() {
        let builder = MatcherBuilder::from_matching_words(matching_words());
        let format = |text: &str| format_with(&builder, text, false, true);

        // empty text.
        assert_eq!(format(""), "");

        // text containing only separators.
        assert_eq!(format(":-)"), ":-)");

        // Text without any match:
        // no highlight should return 10 first words with a marker at the end.
        assert_eq!(format(TEXT_NO_MATCH), "A quick brown fox can not jump 32 feet, right…");

        // Text without any match starting by a separator:
        // no highlight should return 10 first words with a marker at the end.
        assert_eq!(
            format("(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"),
            "(A quick brown fox can not jump 32 feet, right…"
        );

        // Test phrase propagation:
        // should crop the phrase instead of cropping around the match.
        assert_eq!(
            format("Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."),
            "…Split The World is a book written by Emily Henry…"
        );

        // Text containing some matches:
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
            format(TEXT_SOME_MATCHES),
            "…future to build a world with the boy she loves…"
        );

        // Text containing all matches:
        // no highlight should return 10 last words with a marker at the start.
        assert_eq!(
            format(TEXT_ALL_MATCHES),
            "…she loves. Emily Henry: The Love That Split The World."
        );

        // Text containing a match unordered and a match ordered:
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
            format("The world split void void void void void void void void void split the world void void"),
            "…void void void void void split the world void void"
        );

        // Text containing matches with different density:
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
            format("split void the void void world void void void void void void void void void void split the world void void"),
            "…void void void void void split the world void void"
        );

        // Text containing matches with same word:
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
            format("split split split split split split void void void void void void void void void void split the world void void"),
            "…void void void void void split the world void void"
        );
    }

    #[test]
    fn format_highlight_crop() {
        let builder = MatcherBuilder::from_matching_words(matching_words());
        let format = |text: &str| format_with(&builder, text, true, true);

        // empty text.
        assert_eq!(format(""), "");

        // text containing only separators.
        assert_eq!(format(":-)"), ":-)");

        // Text without any match:
        // both should return 10 first words with a marker at the end.
        assert_eq!(format(TEXT_NO_MATCH), "A quick brown fox can not jump 32 feet, right…");

        // Text containing some matches:
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(
            format(TEXT_SOME_MATCHES),
            "…future to build a <em>world</em> with <em>the</em> boy she loves…"
        );

        // Text containing all matches:
        // both should return 10 last words with a marker at the start and highlighted matches.
        assert_eq!(
            format(TEXT_ALL_MATCHES),
            "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."
        );

        // Text containing a match unordered and a match ordered:
        // crop should return 10 last words with a marker at the start.
        assert_eq!(
            format("The world split void void void void void void void void void split the world void void"),
            "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"
        );
    }

    #[test]
    fn smaller_crop_size() {
        // testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let mut builder = MatcherBuilder::from_matching_words(matching_words());
        let highlight = false;
        let crop = true;

        let text = "void void split the world void void.";

        // set a smaller crop size:
        // because crop size < query size, partially format matches.
        builder.crop_size(2);
        assert_eq!(format_with(&builder, text, highlight, crop), "…split the…");

        // set a smaller crop size:
        // because crop size < query size, partially format matches.
        builder.crop_size(1);
        assert_eq!(format_with(&builder, text, highlight, crop), "…split…");

        // set a smaller crop size:
        // because crop size is 0, crop is ignored.
        builder.crop_size(0);
        assert_eq!(
            format_with(&builder, text, highlight, crop),
            "void void split the world void void."
        );
    }
}