diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs new file mode 100644 index 000000000..a47a08c68 --- /dev/null +++ b/milli/src/search/new/matches/matching_words.rs @@ -0,0 +1,334 @@ +use std::cmp::Reverse; +use std::ops::RangeInclusive; + +use charabia::Token; + +use super::super::interner::Interned; +use super::super::query_term::{ + Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm, +}; +use super::super::{DedupInterner, Phrase}; +use crate::SearchContext; + +pub struct LocatedMatchingPhrase { + pub value: Interned, + pub positions: RangeInclusive, +} + +pub struct LocatedMatchingWords { + pub value: Vec>, + pub positions: RangeInclusive, + pub is_prefix: bool, +} + +/// Structure created from a query tree +/// referencing words that match the given query tree. +pub struct MatchingWords<'ctx> { + word_interner: &'ctx DedupInterner, + phrase_interner: &'ctx DedupInterner, + phrases: Vec, + words: Vec, +} + +/// Extract and centralize the different phrases and words to match stored in a QueryTerm. +fn extract_matching_terms(term: &QueryTerm) -> (Vec>, Vec>) { + let mut matching_words = Vec::new(); + let mut matching_phrases = Vec::new(); + + // the structure is exhaustively extracted to ensure that no field is missing. + let QueryTerm { + original: _, + is_multiple_words: _, + max_nbr_typos: _, + is_prefix: _, + zero_typo, + one_typo, + two_typo, + } = term; + + // the structure is exhaustively extracted to ensure that no field is missing. + let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo; + + // zero typo + if let Some(phrase) = phrase { + matching_phrases.push(*phrase); + } + if let Some(zero_typo) = zero_typo { + matching_words.push(*zero_typo); + } + for synonym in synonyms { + matching_phrases.push(*synonym); + } + + // one typo + // the structure is exhaustively extracted to ensure that no field is missing. 
+ if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo { + if let Some(split_words) = split_words { + matching_phrases.push(*split_words); + } + for one_typo in one_typo { + matching_words.push(*one_typo); + } + } + + // two typos + // the structure is exhaustively extracted to ensure that no field is missing. + if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo { + for two_typos in two_typos { + matching_words.push(*two_typos); + } + } + + (matching_phrases, matching_words) +} + +impl<'ctx> MatchingWords<'ctx> { + pub fn new(ctx: &'ctx SearchContext, located_terms: Vec) -> Self { + let mut phrases = Vec::new(); + let mut words = Vec::new(); + + // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms + // and wrap them in dedicated structures. + for located_term in located_terms { + let term = ctx.term_interner.get(located_term.value); + let (matching_phrases, matching_words) = extract_matching_terms(term); + + for matching_phrase in matching_phrases { + phrases.push(LocatedMatchingPhrase { + value: matching_phrase, + positions: located_term.positions.clone(), + }); + } + words.push(LocatedMatchingWords { + value: matching_words, + positions: located_term.positions.clone(), + is_prefix: term.is_prefix, + }); + } + + // Sort word to put prefixes at the bottom prioritizing the exact matches. + words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len()))); + + Self { + phrases, + words, + word_interner: &ctx.word_interner, + phrase_interner: &ctx.phrase_interner, + } + } + + /// Returns an iterator over terms that match or partially match the given token. + pub fn match_token<'b>(&'ctx self, token: &'b Token<'b>) -> MatchesIter<'ctx, 'b> { + MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token } + } + + /// Try to match the token with one of the located_words. 
+ fn match_unique_words(&'ctx self, token: &Token) -> Option> { + for located_words in &self.words { + for word in &located_words.value { + let word = self.word_interner.get(*word); + // if the word is a prefix we match using starts_with. + if located_words.is_prefix && token.lemma().starts_with(word) { + let char_len = token.original_lengths(word.len()).0; + let ids = &located_words.positions; + return Some(MatchType::Full { char_len, ids }); + // else we exact match the token. + } else if token.lemma() == word { + let char_len = token.char_end - token.char_start; + let ids = &located_words.positions; + return Some(MatchType::Full { char_len, ids }); + } + } + } + + None + } +} + +/// Iterator over terms that match the given token. +/// This allows matches to be evaluated lazily. +pub struct MatchesIter<'a, 'b> { + matching_words: &'a MatchingWords<'a>, + phrases: Box + 'a>, + token: &'b Token<'b>, +} + +impl<'a> Iterator for MatchesIter<'a, '_> { + type Item = MatchType<'a>; + + fn next(&mut self) -> Option { + match self.phrases.next() { + // Try to match all the phrases first. + Some(located_phrase) => { + let phrase = self.matching_words.phrase_interner.get(located_phrase.value); + + // create a PartialMatch struct to make it compute the first match + // instead of duplicating the code. + let ids = &located_phrase.positions; + // collect the references of words from the interner. + let words = phrase + .words + .iter() + .map(|word| { + word.map(|word| self.matching_words.word_interner.get(word).as_str()) + }) + .collect(); + let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; + + partial.match_token(self.token).or_else(|| self.next()) + } + // If no phrase matches, try to match unique words. + None => self.matching_words.match_unique_words(self.token), + } + } +} + +/// Id of a matching term corresponding to a word written by the end user. 
+pub type WordId = u16; + +/// A given token can partially match a query word for several reasons: +/// - split words +/// - multi-word synonyms +/// In these cases we need to match several consecutive tokens to consider that the match is full. +#[derive(Debug, PartialEq)] +pub enum MatchType<'a> { + Full { char_len: usize, ids: &'a RangeInclusive }, + Partial(PartialMatch<'a>), +} + +/// Helper structure to match several tokens in a row in order to complete a partial match. +#[derive(Debug, PartialEq)] +pub struct PartialMatch<'a> { + matching_words: Vec>, + ids: &'a RangeInclusive, + char_len: usize, +} + +impl<'a> PartialMatch<'a> { + /// Returns: + /// - None if the given token breaks the partial match + /// - Partial if the given token matches the partial match but doesn't complete it + /// - Full if the given token completes the partial match + pub fn match_token(self, token: &Token) -> Option> { + let Self { mut matching_words, ids, .. } = self; + + let is_matching = match matching_words.first()? { + Some(word) => &token.lemma() == word, + // a None value in the phrase corresponds to a stop word, + // the value is considered a match if the current token is categorized as a stop word. + None => token.is_stopword(), + }; + + let char_len = token.char_end - token.char_start; + // if there are remaining words to match in the phrase and the current token is matching, + // return a new Partial match allowing the highlighter to continue. + if is_matching && matching_words.len() > 1 { + matching_words.remove(0); + Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) + // if there is no remaining word to match in the phrase and the current token is matching, + // return a Full match. + } else if is_matching { + Some(MatchType::Full { char_len, ids }) + // if the current token doesn't match, return None to break the match sequence. 
+ } else { + None + } + } + + pub fn char_len(&self) -> usize { + self.char_len + } +} + +#[cfg(test)] +pub(crate) mod tests { + use std::borrow::Cow; + + use charabia::{TokenKind, TokenizerBuilder}; + + use super::super::super::located_query_terms_from_string; + use super::*; + use crate::index::tests::TempIndex; + + pub(crate) fn temp_index_with_documents() -> TempIndex { + let temp_index = TempIndex::new(); + temp_index + .add_documents(documents!([ + { "id": 1, "name": "split this world westfali westfalia the" }, + ])) + .unwrap(); + temp_index + } + + #[test] + fn matching_words() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let tokenizer = TokenizerBuilder::new().build(); + let tokens = tokenizer.tokenize("split this world"); + let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap(); + let matching_words = MatchingWords::new(&ctx, query_terms); + + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("split"), + char_end: "split".chars().count(), + byte_end: "split".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &(0..=0) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("nyc"), + char_end: "nyc".chars().count(), + byte_end: "nyc".len(), + ..Default::default() + }) + .next(), + None + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("world"), + char_end: "world".chars().count(), + byte_end: "world".len(), + ..Default::default() + }) + .next(), + Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("worlded"), + char_end: "worlded".chars().count(), + byte_end: "worlded".len(), + ..Default::default() + }) + .next(), + 
Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + ); + assert_eq!( + matching_words + .match_token(&Token { + kind: TokenKind::Word, + lemma: Cow::Borrowed("thisnew"), + char_end: "thisnew".chars().count(), + byte_end: "thisnew".len(), + ..Default::default() + }) + .next(), + None + ); + } +} diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs new file mode 100644 index 000000000..33d0591a6 --- /dev/null +++ b/milli/src/search/new/matches/mod.rs @@ -0,0 +1,848 @@ +use std::borrow::Cow; + +use charabia::{SeparatorKind, Token, Tokenizer}; +use matching_words::{MatchType, MatchingWords, PartialMatch, WordId}; +use serde::Serialize; + +use super::query_term::LocatedQueryTerm; +use crate::SearchContext; + +pub mod matching_words; + +const DEFAULT_CROP_MARKER: &str = "…"; +const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; +const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; + +/// Structure used to build a Matcher allowing to customize formating tags. +pub struct MatcherBuilder<'a, 'ctx, A> { + matching_words: MatchingWords<'ctx>, + tokenizer: Tokenizer<'a, 'a, A>, + crop_marker: Option, + highlight_prefix: Option, + highlight_suffix: Option, +} + +impl<'a, 'ctx, A> MatcherBuilder<'a, 'ctx, A> { + pub fn new( + ctx: &'ctx SearchContext, + located_terms: Vec, + tokenizer: Tokenizer<'a, 'a, A>, + ) -> Self { + let matching_words = MatchingWords::new(ctx, located_terms); + Self { + matching_words, + tokenizer, + crop_marker: None, + highlight_prefix: None, + highlight_suffix: None, + } + } + + pub fn crop_marker(&mut self, marker: String) -> &Self { + self.crop_marker = Some(marker); + self + } + + pub fn highlight_prefix(&mut self, prefix: String) -> &Self { + self.highlight_prefix = Some(prefix); + self + } + + pub fn highlight_suffix(&mut self, suffix: String) -> &Self { + self.highlight_suffix = Some(suffix); + self + } + + pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { + let crop_marker = match &self.crop_marker { + 
Some(marker) => marker.as_str(), + None => DEFAULT_CROP_MARKER, + }; + + let highlight_prefix = match &self.highlight_prefix { + Some(marker) => marker.as_str(), + None => DEFAULT_HIGHLIGHT_PREFIX, + }; + let highlight_suffix = match &self.highlight_suffix { + Some(marker) => marker.as_str(), + None => DEFAULT_HIGHLIGHT_SUFFIX, + }; + Matcher { + text, + matching_words: &self.matching_words, + tokenizer: &self.tokenizer, + crop_marker, + highlight_prefix, + highlight_suffix, + matches: None, + } + } +} + +#[derive(Copy, Clone, Default)] +pub struct FormatOptions { + pub highlight: bool, + pub crop: Option, +} + +impl FormatOptions { + pub fn merge(self, other: Self) -> Self { + Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } + } +} + +#[derive(Clone, Debug)] +pub struct Match { + match_len: usize, + // ids of the query words that match. + ids: Vec, + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, +} + +#[derive(Serialize, Debug, Clone, PartialEq, Eq)] +pub struct MatchBounds { + pub start: usize, + pub length: usize, +} + +/// Structure used to analyze a string, compute words that match, +/// and format the source string, returning a highlighted and cropped sub-string. +pub struct Matcher<'t, 'm, A> { + text: &'t str, + matching_words: &'m MatchingWords<'m>, + tokenizer: &'m Tokenizer<'m, 'm, A>, + crop_marker: &'m str, + highlight_prefix: &'m str, + highlight_suffix: &'m str, + matches: Option<(Vec>, Vec)>, +} + +impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { + /// Iterates over tokens and saves any of them that match the query. + fn compute_matches(&mut self) -> &mut Self { + /// some words are counted as matches only if they are close together and in the right order, + /// compute_partial_match peeks into next words to validate if the match is complete. 
+ fn compute_partial_match<'a>( + mut partial: PartialMatch, + token_position: usize, + word_position: usize, + words_positions: &mut impl Iterator)>, + matches: &mut Vec, + ) -> bool { + let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; + + for (token_position, word_position, word) in words_positions { + partial = match partial.match_token(word) { + // token matches the partial match, but the match is not full, + // we temporarily save the current token then we try to match the next one. + Some(MatchType::Partial(partial)) => { + potential_matches.push((token_position, word_position, partial.char_len())); + partial + } + // partial match is now full, we keep these matches and we advance positions + Some(MatchType::Full { char_len, ids }) => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + // save previously matched tokens as matches. + let iter = potential_matches.into_iter().map( + |(token_position, word_position, match_len)| Match { + match_len, + ids: ids.clone(), + word_position, + token_position, + }, + ); + matches.extend(iter); + + // save the token that closes the partial match as a match. + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + + // the match is complete, we return true. + return true; + } + // no match, stop trying to complete this partial match. + None => break, + }; + } + + // the match is not complete, we return false. 
+ false + } + + let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); + let mut matches = Vec::new(); + + let mut words_positions = tokens + .iter() + .scan((0, 0), |(token_position, word_position), token| { + let current_token_position = *token_position; + let current_word_position = *word_position; + *token_position += 1; + if !token.is_separator() { + *word_position += 1; + } + + Some((current_token_position, current_word_position, token)) + }) + .filter(|(_, _, token)| !token.is_separator()); + + while let Some((token_position, word_position, word)) = words_positions.next() { + for match_type in self.matching_words.match_token(word) { + match match_type { + // we match, we save the current token as a match, + // then we continue the rest of the tokens. + MatchType::Full { char_len, ids } => { + let ids: Vec<_> = ids.clone().into_iter().collect(); + matches.push(Match { + match_len: char_len, + ids, + word_position, + token_position, + }); + break; + } + // we match partially, iterate over next tokens to check if we can complete the match. + MatchType::Partial(partial) => { + // if match is completed, we break the matching loop over the current token, + // then we continue the rest of the tokens. + let mut wp = words_positions.clone(); + if compute_partial_match( + partial, + token_position, + word_position, + &mut wp, + &mut matches, + ) { + words_positions = wp; + break; + } + } + } + } + } + + self.matches = Some((tokens, matches)); + self + } + + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[m.token_position].byte_start, + length: m.match_len, + }) + .collect(), + } + } + + /// Returns the bounds in byte index of the crop window. 
+ fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { + // if there is no match, we start from the beginning of the string by default. + let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); + let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + + // matches need to be counted in the crop len. + let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going towards the end of the text. + let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + + // grows the crop window peeking in both directions + // until the window contains the right number of words: + while remaining_words > 0 { + let before_token = before_tokens.peek().map(|t| t.separator_kind()); + let after_token = after_tokens.peek().map(|t| t.separator_kind()); + + match (before_token, after_token) { + // we can expand both sides. + (Some(before_token), Some(after_token)) => { + match (before_token, after_token) { + // if they are both separators and are the same kind then advance both, + // or expand on the soft separator side. + (Some(before_token_kind), Some(after_token_kind)) => { + if before_token_kind == after_token_kind { + before_tokens.next(); + + // this avoids having an ending separator before crop marker. 
+ if remaining_words > 1 { + after_tokens.next(); + } + } else if before_token_kind == SeparatorKind::Hard { + after_tokens.next(); + } else { + before_tokens.next(); + } + } + // if one of the tokens is a word, we expand on the side of the word. + // left is a word, advance left. + (None, Some(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (Some(_), None) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_words > 0. + (None, None) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } + } + } + // the end of the text is reached, advance left. + (Some(before_token), None) => { + before_tokens.next(); + if before_token.is_none() { + remaining_words -= 1; + } + } + // the start of the text is reached, advance right. + (None, Some(after_token)) => { + after_tokens.next(); + if after_token.is_none() { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, + } + } + + // finally, keep the byte index of each bound of the crop window. 
+ let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } + + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + // compute distance between matches, each gap being capped at 7 words + distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + (uniq_score, distance_score, order_score) + } + + /// Returns the matches interval where the score computed by match_interval_score is the best. + fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + // we compute the matches interval if we have at least 2 matches. + if matches.len() > 1 { + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval = (0, 0); + let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. 
+ let mut interval_first = 0; + let mut interval_last = 0; + for (index, next_match) in matches.iter().enumerate().skip(1) { + // if next match would make the interval grow beyond crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. 
+ if next_match.word_position - matches[interval_first].word_position >= crop_size { + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } + + // advance start of the interval while interval is longer than crop_size. + while next_match.word_position - matches[interval_first].word_position + >= crop_size + { + interval_first += 1; + } + } + interval_last = index; + } + + // compute the last interval score and compare it to the best one. + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + } + + &matches[best_interval.0..=best_interval.1] + } else { + matches + } + } + + // Returns the formatted version of the original text. + pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { + if !format_options.highlight && format_options.crop.is_none() { + // compute matches is not needed if no highlight nor crop is requested. + Cow::Borrowed(self.text) + } else { + match &self.matches { + Some((tokens, matches)) => { + // If the text has to be cropped, + // compute the best interval to crop around. + let matches = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.find_best_match_interval(matches, crop_size) + } + _ => matches, + }; + + // If the text has to be cropped, + // crop around the best interval. + let (byte_start, byte_end) = match format_options.crop { + Some(crop_size) if crop_size > 0 => { + self.crop_bounds(tokens, matches, crop_size) + } + _ => (0, self.text.len()), + }; + + let mut formatted = Vec::new(); + + // push crop marker if it's not the start of the text. 
+ if byte_start > 0 && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + let mut byte_index = byte_start; + + if format_options.highlight { + // insert highlight markers around matches. + for m in matches { + let token = &tokens[m.token_position]; + + if byte_index < token.byte_start { + formatted.push(&self.text[byte_index..token.byte_start]); + } + + let highlight_byte_index = self.text[token.byte_start..] + .char_indices() + .enumerate() + .find(|(i, _)| *i == m.match_len) + .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); + formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. + if highlight_byte_index < token.byte_end { + formatted.push(&self.text[highlight_byte_index..token.byte_end]); + } + + byte_index = token.byte_end; + } + } + + // push the rest of the text between last match and the end of crop. + if byte_index < byte_end { + formatted.push(&self.text[byte_index..byte_end]); + } + + // push crop marker if it's not the end of the text. + if byte_end < self.text.len() && !self.crop_marker.is_empty() { + formatted.push(self.crop_marker); + } + + if formatted.len() == 1 { + // avoid concatenating if there is already 1 slice. 
+ Cow::Borrowed(&self.text[byte_start..byte_end]) + } else { + Cow::Owned(formatted.concat()) + } + } + None => self.compute_matches().format(format_options), + } + } + } +} + +#[cfg(test)] +mod tests { + use charabia::TokenizerBuilder; + use matching_words::tests::temp_index_with_documents; + + use super::super::located_query_terms_from_string; + use super::*; + + impl<'a, 'ctx> MatcherBuilder<'a, 'ctx, &[u8]> { + pub fn new_test(ctx: &'ctx mut SearchContext, query: &'a str) -> Self { + let tokenizer = TokenizerBuilder::new().build(); + let tokens = tokenizer.tokenize(query); + let query_terms = located_query_terms_from_string(ctx, tokens, None).unwrap(); + Self::new(ctx, query_terms, TokenizerBuilder::new().build()) + } + } + + #[test] + fn format_identity() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: false, crop: None }; + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop and no highlight should return complete text. 
+ assert_eq!(&matcher.format(format_options), &text); + } + + #[test] + fn format_highlight() { + let temp_index = temp_index_with_documents(); + let rtxn = temp_index.read_txn().unwrap(); + let mut ctx = SearchContext::new(&temp_index, &rtxn); + let builder = MatcherBuilder::new_test(&mut ctx, "split the world"); + + let format_options = FormatOptions { highlight: true, crop: None }; + + // empty text. + let text = ""; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ""); + + // text containing only separators. + let text = ":-)"; + let mut matcher = builder.build(text); + assert_eq!(&matcher.format(format_options), ":-)"); + + // Text without any match. + let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; + let mut matcher = builder.build(text); + // no crop should return complete text, because there is no matches. + assert_eq!(&matcher.format(format_options), &text); + + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." + ); + + // Text containing some matches. + let text = "Natalie risk her future to build a world with the boy she loves."; + let mut matcher = builder.build(text); + // no crop should return complete text with highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"Natalie risk her future to build a world with the boy she loves." 
+        );
+    }
+
+    // Checks highlighting (without cropping) on texts made of accented characters,
+    // using the non-accented queries "world" and "westfali".
+    //
+    // NOTE(review): `highlight` is enabled, yet every expected snapshot below is identical
+    // to its input text and carries no highlight marker. The markers look like they were
+    // stripped from this copy of the file; confirm against the upstream source before
+    // trusting these expectations.
+    #[test]
+    fn highlight_unicode() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(&mut ctx, "world");
+        let format_options = FormatOptions { highlight: true, crop: None };
+
+        // Text containing a prefix match: the accented form of the query followed by "ôle".
+        let text = "Ŵôřlḑôle";
+        let mut matcher = builder.build(text);
+        // no crop should return complete text with highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"Ŵôřlḑôle"
+        );
+
+        // Text containing a unicode match: the accented form of the whole query.
+        let text = "Ŵôřlḑ";
+        let mut matcher = builder.build(text);
+        // no crop should return complete text with highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"Ŵôřlḑ"
+        );
+
+        // Same check with another query; the previous builder and options are shadowed.
+        let builder = MatcherBuilder::new_test(&mut ctx, "westfali");
+        let format_options = FormatOptions { highlight: true, crop: None };
+
+        // Text containing a unicode match; the query is one character shorter than the text.
+        let text = "Westfália";
+        let mut matcher = builder.build(text);
+        // no crop should return complete text with highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"Westfália"
+        );
+    }
+
+    // Checks cropping alone (`highlight: false`) with a crop size of 10 for the query
+    // "split the world".
+    #[test]
+    fn format_crop() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");
+
+        let format_options = FormatOptions { highlight: false, crop: Some(10) };
+
+        // Empty text.
+        let text = "";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @""
+        );
+
+        // Text containing only separators.
+        let text = ":-)";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @":-)"
+        );
+
+        // Text without any match.
+        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 first words with a marker at the end.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"A quick brown fox can not jump 32 feet, right…"
+        );
+
+        // Text without any match starting with a separator.
+        let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 first words with a marker at the end.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"(A quick brown fox can not jump 32 feet, right…"
+        );
+
+        // Test phrase propagation.
+        let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.";
+        let mut matcher = builder.build(text);
+        // should crop the phrase instead of cropping around the match.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"… Split The World is a book written by Emily Henry…"
+        );
+
+        // Text containing some matches.
+        let text = "Natalie risk her future to build a world with the boy she loves.";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 last words with a marker on both sides (the final "." is cropped out).
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…future to build a world with the boy she loves…"
+        );
+
+        // Text containing all matches.
+        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
+        let mut matcher = builder.build(text);
+        // no highlight should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…she loves. Emily Henry: The Love That Split The World."
+        );
+
+        // Text containing a match unordered and a match ordered.
+        let text = "The world split void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+
+        // Text containing matches with different density.
+        let text = "split void the void void world void void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+
+        // Text containing matches with the same word.
+        let text = "split split split split split split void void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+    }
+
+    // Checks highlighting and cropping together (crop size of 10) for the query
+    // "split the world".
+    //
+    // NOTE(review): `highlight` is enabled, yet the expected snapshots below carry no
+    // highlight marker and are identical to the crop-only expectations of `format_crop`.
+    // The markers look like they were stripped from this copy of the file; confirm against
+    // the upstream source before trusting these expectations.
+    #[test]
+    fn format_highlight_crop() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");
+
+        let format_options = FormatOptions { highlight: true, crop: Some(10) };
+
+        // Empty text.
+        let text = "";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @""
+        );
+
+        // Text containing only separators.
+        let text = ":-)";
+        let mut matcher = builder.build(text);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @":-)"
+        );
+
+        // Text without any match.
+        let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!";
+        let mut matcher = builder.build(text);
+        // both should return 10 first words with a marker at the end.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"A quick brown fox can not jump 32 feet, right…"
+        );
+
+        // Text containing some matches.
+        let text = "Natalie risk her future to build a world with the boy she loves.";
+        let mut matcher = builder.build(text);
+        // both should return 10 last words with a marker on both sides and highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…future to build a world with the boy she loves…"
+        );
+
+        // Text containing all matches.
+        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
+        let mut matcher = builder.build(text);
+        // both should return 10 last words with a marker at the start and highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…she loves. Emily Henry: The Love That Split The World."
+        );
+
+        // Text containing a match unordered and a match ordered.
+        let text = "The world split void void void void void void void void void split the world void void";
+        let mut matcher = builder.build(text);
+        // crop should return 10 last words with a marker at the start.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…void void void void void split the world void void"
+        );
+    }
+
+    // Checks cropping when the crop size is smaller than the number of query words
+    // (the query "split the world" has 3 words), down to a crop size of 0.
+    #[test]
+    fn smaller_crop_size() {
+        //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let builder = MatcherBuilder::new_test(&mut ctx, "split the world");
+
+        let text = "void void split the world void void.";
+
+        // set a crop size (2) smaller than the number of query words (3).
+        let format_options = FormatOptions { highlight: false, crop: Some(2) };
+        let mut matcher = builder.build(text);
+        // because crop size < query size, partially format matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…split the…"
+        );
+
+        // set an even smaller crop size (1).
+        let format_options = FormatOptions { highlight: false, crop: Some(1) };
+        let mut matcher = builder.build(text);
+        // because crop size < query size, partially format matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…split…"
+        );
+
+        // set crop size to 0
+        let format_options = FormatOptions { highlight: false, crop: Some(0) };
+        let mut matcher = builder.build(text);
+        // because crop size is 0, crop is ignored.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"void void split the world void void."
+        );
+    }
+
+    // Checks highlighting when the query mixes plain words with quoted phrases
+    // (`the "t he" door "do or"`), using "_" as both the highlight prefix and suffix.
+    #[test]
+    fn partial_matches() {
+        let temp_index = temp_index_with_documents();
+        let rtxn = temp_index.read_txn().unwrap();
+        let mut ctx = SearchContext::new(&temp_index, &rtxn);
+        let mut builder = MatcherBuilder::new_test(&mut ctx, "the \"t he\" door \"do or\"");
+        builder.highlight_prefix("_".to_string());
+        builder.highlight_suffix("_".to_string());
+
+        let format_options = FormatOptions { highlight: true, crop: None };
+
+        let text = "the do or die can't be he do and or isn't he";
+        let mut matcher = builder.build(text);
+        // Expected: "do or" is marked where the two words are adjacent but not in "do and or";
+        // "t he" is marked across "isn't he" but not in "can't be he".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
+        );
+    }
+}
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 4d561d25b..ef7e61ee1 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -4,6 +4,7 @@ mod graph_based_ranking_rule;
 mod interner;
 mod limits;
 mod logger;
+mod matches;
 mod query_graph;
 mod query_term;
 mod ranking_rule_graph;