diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs deleted file mode 100644 index 5ccf0286f..000000000 --- a/milli/src/search/matches/matching_words.rs +++ /dev/null @@ -1,457 +0,0 @@ -use std::cmp::{min, Reverse}; -use std::collections::BTreeMap; -use std::fmt; -use std::ops::{Index, IndexMut}; -use std::rc::Rc; - -use charabia::Token; -use levenshtein_automata::{Distance, DFA}; - -use crate::error::InternalError; -use crate::search::build_dfa; -use crate::MAX_WORD_LENGTH; - -type IsPrefix = bool; - -/// Structure created from a query tree -/// referencing words that match the given query tree. -#[derive(Default)] -pub struct MatchingWords { - inner: Vec<(Vec>, Vec)>, -} - -impl fmt::Debug for MatchingWords { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - writeln!(f, "[")?; - for (matching_words, primitive_word_id) in self.inner.iter() { - writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?; - } - writeln!(f, "]")?; - Ok(()) - } -} - -impl MatchingWords { - pub fn new( - mut matching_words: Vec<(Vec>, Vec)>, - ) -> crate::Result { - // if one of the matching_words vec doesn't contain a word. - if matching_words.iter().any(|(mw, _)| mw.is_empty()) { - return Err(InternalError::InvalidMatchingWords.into()); - } - - // Sort word by len in DESC order prioritizing the longuest matches, - // in order to highlight the longuest part of the matched word. - matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len()))); - - Ok(Self { inner: matching_words }) - } - - /// Returns an iterator over terms that match or partially match the given token. - pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> { - MatchesIter { inner: Box::new(self.inner.iter()), token } - } -} - -/// Iterator over terms that match the given token, -/// This allow to lazily evaluate matches. -pub struct MatchesIter<'a, 'b> { - #[allow(clippy::type_complexity)] - inner: Box>, Vec)> + 'a>, - token: &'b Token<'b>, -} - -impl<'a> Iterator for MatchesIter<'a, '_> { - type Item = MatchType<'a>; - - fn next(&mut self) -> Option { - match self.inner.next() { - Some((matching_words, ids)) => match matching_words[0].match_token(self.token) { - Some(char_len) => { - if matching_words.len() > 1 { - Some(MatchType::Partial(PartialMatch { - matching_words: &matching_words[1..], - ids, - char_len, - })) - } else { - Some(MatchType::Full { char_len, ids }) - } - } - None => self.next(), - }, - None => None, - } - } -} - -/// Id of a matching term corespounding to a word written by the end user. -pub type PrimitiveWordId = u8; - -/// Structure used to match a specific term. -pub struct MatchingWord { - pub dfa: DFA, - pub word: String, - pub typo: u8, - pub prefix: IsPrefix, -} - -impl fmt::Debug for MatchingWord { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("MatchingWord") - .field("word", &self.word) - .field("typo", &self.typo) - .field("prefix", &self.prefix) - .finish() - } -} - -impl PartialEq for MatchingWord { - fn eq(&self, other: &Self) -> bool { - self.prefix == other.prefix && self.typo == other.typo && self.word == other.word - } -} - -impl MatchingWord { - pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option { - if word.len() > MAX_WORD_LENGTH { - return None; - } - let dfa = build_dfa(&word, typo, prefix); - - Some(Self { dfa, word, typo, prefix }) - } - - /// Returns the lenght in chars of the match in case of the token matches the term. - pub fn match_token(&self, token: &Token) -> Option { - match self.dfa.eval(token.lemma()) { - Distance::Exact(t) if t <= self.typo => { - if self.prefix { - let len = bytes_to_highlight(token.lemma(), &self.word); - Some(token.original_lengths(len).0) - } else { - Some(token.original_lengths(token.lemma().len()).0) - } - } - _otherwise => None, - } - } -} - -/// A given token can partially match a query word for several reasons: -/// - split words -/// - multi-word synonyms -/// In these cases we need to match consecutively several tokens to consider that the match is full. -#[derive(Debug, PartialEq)] -pub enum MatchType<'a> { - Full { char_len: usize, ids: &'a [PrimitiveWordId] }, - Partial(PartialMatch<'a>), -} - -/// Structure helper to match several tokens in a row in order to complete a partial match. -#[derive(Debug, PartialEq)] -pub struct PartialMatch<'a> { - matching_words: &'a [Rc], - ids: &'a [PrimitiveWordId], - char_len: usize, -} - -impl<'a> PartialMatch<'a> { - /// Returns: - /// - None if the given token breaks the partial match - /// - Partial if the given token matches the partial match but doesn't complete it - /// - Full if the given token completes the partial match - pub fn match_token(self, token: &Token) -> Option> { - self.matching_words[0].match_token(token).map(|char_len| { - if self.matching_words.len() > 1 { - MatchType::Partial(PartialMatch { - matching_words: &self.matching_words[1..], - ids: self.ids, - char_len, - }) - } else { - MatchType::Full { char_len, ids: self.ids } - } - }) - } - - pub fn char_len(&self) -> usize { - self.char_len - } -} - -// A simple wrapper around vec so we can get contiguous but index it like it's 2D array. -struct N2Array { - y_size: usize, - buf: Vec, -} - -impl N2Array { - fn new(x: usize, y: usize, value: T) -> N2Array { - N2Array { y_size: y, buf: vec![value; x * y] } - } -} - -impl Index<(usize, usize)> for N2Array { - type Output = T; - - #[inline] - fn index(&self, (x, y): (usize, usize)) -> &T { - &self.buf[(x * self.y_size) + y] - } -} - -impl IndexMut<(usize, usize)> for N2Array { - #[inline] - fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T { - &mut self.buf[(x * self.y_size) + y] - } -} - -/// Returns the number of **bytes** we want to highlight in the `source` word. -/// Basically we want to highlight as much characters as possible in the source until it has too much -/// typos (= 2) -/// The algorithm is a modified -/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance) -fn bytes_to_highlight(source: &str, target: &str) -> usize { - let n = source.chars().count(); - let m = target.chars().count(); - - if n == 0 { - return 0; - } - // since we allow two typos we can send two characters even if it's completely wrong - if m < 3 { - return source.chars().take(m).map(|c| c.len_utf8()).sum(); - } - if n == m && source == target { - return source.len(); - } - - let inf = n + m; - let mut matrix = N2Array::new(n + 2, m + 2, 0); - - matrix[(0, 0)] = inf; - for i in 0..=n { - matrix[(i + 1, 0)] = inf; - matrix[(i + 1, 1)] = i; - } - for j in 0..=m { - matrix[(0, j + 1)] = inf; - matrix[(1, j + 1)] = j; - } - - let mut last_row = BTreeMap::new(); - - for (row, char_s) in source.chars().enumerate() { - let mut last_match_col = 0; - let row = row + 1; - - for (col, char_t) in target.chars().enumerate() { - let col = col + 1; - let last_match_row = *last_row.get(&char_t).unwrap_or(&0); - let cost = usize::from(char_s != char_t); - - let dist_add = matrix[(row, col + 1)] + 1; - let dist_del = matrix[(row + 1, col)] + 1; - let dist_sub = matrix[(row, col)] + cost; - let dist_trans = matrix[(last_match_row, last_match_col)] - + (row - last_match_row - 1) - + 1 - + (col - last_match_col - 1); - let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans)); - matrix[(row + 1, col + 1)] = dist; - - if cost == 0 { - last_match_col = col; - } - } - - last_row.insert(char_s, row); - } - - let mut minimum = (u32::max_value(), 0); - for x in 0..=m { - let dist = matrix[(n + 1, x + 1)] as u32; - if dist < minimum.0 { - minimum = (dist, x); - } - } - - // everything was done characters wise and now we want to returns a number of bytes - source.chars().take(minimum.1).map(|c| c.len_utf8()).sum() -} - -#[cfg(test)] -mod tests { - use std::borrow::Cow; - use std::str::from_utf8; - - use charabia::TokenKind; - - use super::{MatchingWords, *}; - - #[test] - fn test_bytes_to_highlight() { - struct TestBytesToHighlight { - query: &'static str, - text: &'static str, - length: usize, - } - let tests = [ - TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() }, - TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() }, - TestBytesToHighlight { - query: "Levenshtein", - text: "Levenshtein", - length: "Levenshtein".len(), - }, - // we get to the end of our word with only one typo - TestBytesToHighlight { - query: "Levenste", - text: "Levenshtein", - length: "Levenste".len(), - }, - // we get our third and last authorized typo right on the last character - TestBytesToHighlight { - query: "Levenstein", - text: "Levenshte", - length: "Levenste".len(), - }, - // we get to the end of our word with only two typos at the beginning - TestBytesToHighlight { - query: "Bavenshtein", - text: "Levenshtein", - length: "Bavenshtein".len(), - }, - TestBytesToHighlight { - query: "Альфа", text: "Альфой", length: "Альф".len() - }, - TestBytesToHighlight { - query: "Go💼", text: "Go💼od luck.", length: "Go💼".len() - }, - TestBytesToHighlight { - query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len() - }, - TestBytesToHighlight { - query: "chäräcters", - text: "chäräcters", - length: "chäräcters".len(), - }, - TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() }, - TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() }, - ]; - - for test in &tests { - let length = bytes_to_highlight(test.text, test.query); - assert_eq!(length, test.length, r#"lenght between: "{}" "{}""#, test.query, test.text); - assert!( - from_utf8(&test.query.as_bytes()[..length]).is_ok(), - r#"converting {}[..{}] to an utf8 str failed"#, - test.query, - length - ); - } - } - - #[test] - fn matching_words() { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("word"), - char_end: "word".chars().count(), - byte_end: "word".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 3, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("nyc"), - char_end: "nyc".chars().count(), - byte_end: "nyc".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("world"), - char_end: "world".chars().count(), - byte_end: "world".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("splitted"), - char_end: "splitted".chars().count(), - byte_end: "splitted".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[0] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("thisnew"), - char_end: "thisnew".chars().count(), - byte_end: "thisnew".len(), - ..Default::default() - }) - .next(), - None - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("borld"), - char_end: "borld".chars().count(), - byte_end: "borld".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 5, ids: &[2] }) - ); - assert_eq!( - matching_words - .match_token(&Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("wordsplit"), - char_end: "wordsplit".chars().count(), - byte_end: "wordsplit".len(), - ..Default::default() - }) - .next(), - Some(MatchType::Full { char_len: 4, ids: &[2] }) - ); - } -} diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs deleted file mode 100644 index c634ae297..000000000 --- a/milli/src/search/matches/mod.rs +++ /dev/null @@ -1,865 +0,0 @@ -use std::borrow::Cow; - -use charabia::{SeparatorKind, Token, Tokenizer}; -use matching_words::{MatchType, PartialMatch, PrimitiveWordId}; -pub use matching_words::{MatchingWord, MatchingWords}; -use serde::Serialize; - -pub mod matching_words; - -const DEFAULT_CROP_MARKER: &str = "…"; -const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; -const DEFAULT_HIGHLIGHT_SUFFIX: &str = ""; - -/// Structure used to build a Matcher allowing to customize formating tags. -pub struct MatcherBuilder<'a, A> { - matching_words: MatchingWords, - tokenizer: Tokenizer<'a, 'a, A>, - crop_marker: Option, - highlight_prefix: Option, - highlight_suffix: Option, -} - -impl<'a, A> MatcherBuilder<'a, A> { - pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self { - Self { - matching_words, - tokenizer, - crop_marker: None, - highlight_prefix: None, - highlight_suffix: None, - } - } - - pub fn crop_marker(&mut self, marker: String) -> &Self { - self.crop_marker = Some(marker); - self - } - - pub fn highlight_prefix(&mut self, prefix: String) -> &Self { - self.highlight_prefix = Some(prefix); - self - } - - pub fn highlight_suffix(&mut self, suffix: String) -> &Self { - self.highlight_suffix = Some(suffix); - self - } - - pub fn build<'t, 'm>(&'m self, text: &'t str) -> Matcher<'t, 'm, A> { - let crop_marker = match &self.crop_marker { - Some(marker) => marker.as_str(), - None => DEFAULT_CROP_MARKER, - }; - - let highlight_prefix = match &self.highlight_prefix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_PREFIX, - }; - let highlight_suffix = match &self.highlight_suffix { - Some(marker) => marker.as_str(), - None => DEFAULT_HIGHLIGHT_SUFFIX, - }; - Matcher { - text, - matching_words: &self.matching_words, - tokenizer: &self.tokenizer, - crop_marker, - highlight_prefix, - highlight_suffix, - matches: None, - } - } -} - -#[derive(Copy, Clone, Default)] -pub struct FormatOptions { - pub highlight: bool, - pub crop: Option, -} - -impl FormatOptions { - pub fn merge(self, other: Self) -> Self { - Self { highlight: self.highlight || other.highlight, crop: self.crop.or(other.crop) } - } -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, -} - -#[derive(Serialize, Debug, Clone, PartialEq, Eq)] -pub struct MatchBounds { - pub start: usize, - pub length: usize, -} - -/// Structure used to analize a string, compute words that match, -/// and format the source string, returning a highlighted and cropped sub-string. -pub struct Matcher<'t, 'm, A> { - text: &'t str, - matching_words: &'m MatchingWords, - tokenizer: &'m Tokenizer<'m, 'm, A>, - crop_marker: &'m str, - highlight_prefix: &'m str, - highlight_suffix: &'m str, - matches: Option<(Vec>, Vec)>, -} - -impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> { - /// Iterates over tokens and save any of them that matches the query. - fn compute_matches(&mut self) -> &mut Self { - /// some words are counted as matches only if they are close together and in the good order, - /// compute_partial_match peek into next words to validate if the match is complete. - fn compute_partial_match<'a>( - mut partial: PartialMatch, - token_position: usize, - word_position: usize, - words_positions: &mut impl Iterator)>, - matches: &mut Vec, - ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { - partial = match partial.match_token(word) { - // token matches the partial match, but the match is not full, - // we temporarly save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } - // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.to_vec(), - word_position, - token_position, - }, - ); - matches.extend(iter); - - // save the token that closes the partial match as a match. - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - - // the match is complete, we return true. - return true; - } - // no match, continue to next match. - None => break, - }; - } - - // the match is not complete, we return false. - false - } - - let tokens: Vec<_> = self.tokenizer.tokenize(self.text).collect(); - let mut matches = Vec::new(); - - let mut words_positions = tokens - .iter() - .scan((0, 0), |(token_position, word_position), token| { - let current_token_position = *token_position; - let current_word_position = *word_position; - *token_position += 1; - if !token.is_separator() { - *word_position += 1; - } - - Some((current_token_position, current_word_position, token)) - }) - .filter(|(_, _, token)| !token.is_separator()); - - while let Some((token_position, word_position, word)) = words_positions.next() { - for match_type in self.matching_words.match_token(word) { - match match_type { - // we match, we save the current token as a match, - // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { - matches.push(Match { - match_len: char_len, - ids: ids.to_vec(), - word_position, - token_position, - }); - break; - } - // we match partially, iterate over next tokens to check if we can complete the match. - MatchType::Partial(partial) => { - // if match is completed, we break the matching loop over the current token, - // then we continue the rest of the tokens. - let mut wp = words_positions.clone(); - if compute_partial_match( - partial, - token_position, - word_position, - &mut wp, - &mut matches, - ) { - words_positions = wp; - break; - } - } - } - } - } - - self.matches = Some((tokens, matches)); - self - } - - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { - match &self.matches { - None => self.compute_matches().matches(), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[m.token_position].byte_start, - length: m.match_len, - }) - .collect(), - } - } - - /// Returns the bounds in byte index of the crop window. - fn crop_bounds(&self, tokens: &[Token], matches: &[Match], crop_size: usize) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); - let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); - - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; - - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); - - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.separator_kind()); - let after_token = after_tokens.peek().map(|t| t.separator_kind()); - - match (before_token, after_token) { - // we can expand both sides. - (Some(before_token), Some(after_token)) => { - match (before_token, after_token) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - (Some(before_token_kind), Some(after_token_kind)) => { - if before_token_kind == after_token_kind { - before_tokens.next(); - - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if before_token_kind == SeparatorKind::Hard { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (None, Some(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (Some(_), None) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (None, None) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } - } - } - // the end of the text is reached, advance left. - (Some(before_token), None) => { - before_tokens.next(); - if before_token.is_none() { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token)) => { - after_tokens.next(); - if after_token.is_none() { - remaining_words -= 1; - } - } - // no more token to add. - (None, None) => break, - } - } - - // finally, keep the byte index of each bound of the crop window. - let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) - } - - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - // compute distance between matches - distance_score -= (next_match.word_position - m.word_position).min(7) as i16; - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - (uniq_score, distance_score, order_score) - } - - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { - // we compute the matches interval if we have at least 2 matches. - if matches.len() > 1 { - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval = (0, 0); - let mut best_interval_score = self.match_interval_score(&matches[0..=0]); - // current interval positions. - let mut interval_first = 0; - let mut interval_last = 0; - for (index, next_match) in matches.iter().enumerate().skip(1) { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position >= crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; - } - - // advance start of the interval while interval is longer than crop_size. - while next_match.word_position - matches[interval_first].word_position - >= crop_size - { - interval_first += 1; - } - } - interval_last = index; - } - - // compute the last interval score and compare it to the best one. - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - } - - &matches[best_interval.0..=best_interval.1] - } else { - matches - } - } - - // Returns the formatted version of the original text. - pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { - if !format_options.highlight && format_options.crop.is_none() { - // compute matches is not needed if no highlight nor crop is requested. - Cow::Borrowed(self.text) - } else { - match &self.matches { - Some((tokens, matches)) => { - // If the text has to be cropped, - // compute the best interval to crop around. - let matches = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.find_best_match_interval(matches, crop_size) - } - _ => matches, - }; - - // If the text has to be cropped, - // crop around the best interval. - let (byte_start, byte_end) = match format_options.crop { - Some(crop_size) if crop_size > 0 => { - self.crop_bounds(tokens, matches, crop_size) - } - _ => (0, self.text.len()), - }; - - let mut formatted = Vec::new(); - - // push crop marker if it's not the start of the text. - if byte_start > 0 && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - let mut byte_index = byte_start; - - if format_options.highlight { - // insert highlight markers around matches. - for m in matches { - let token = &tokens[m.token_position]; - - if byte_index < token.byte_start { - formatted.push(&self.text[byte_index..token.byte_start]); - } - - let highlight_byte_index = self.text[token.byte_start..] - .char_indices() - .enumerate() - .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); - formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..highlight_byte_index]); - formatted.push(self.highlight_suffix); - // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < token.byte_end { - formatted.push(&self.text[highlight_byte_index..token.byte_end]); - } - - byte_index = token.byte_end; - } - } - - // push the rest of the text between last match and the end of crop. - if byte_index < byte_end { - formatted.push(&self.text[byte_index..byte_end]); - } - - // push crop marker if it's not the end of the text. - if byte_end < self.text.len() && !self.crop_marker.is_empty() { - formatted.push(self.crop_marker); - } - - if formatted.len() == 1 { - // avoid concatenating if there is already 1 slice. - Cow::Borrowed(&self.text[byte_start..byte_end]) - } else { - Cow::Owned(formatted.concat()) - } - } - None => self.compute_matches().format(format_options), - } - } - } -} - -#[cfg(test)] -mod tests { - use std::rc::Rc; - - use charabia::TokenizerBuilder; - - use super::*; - use crate::search::matches::matching_words::MatchingWord; - - fn matching_words() -> MatchingWords { - let all = vec![ - Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone()], vec![1]), - (vec![all[2].clone()], vec![2]), - ]; - - MatchingWords::new(matching_words).unwrap() - } - - impl MatcherBuilder<'_, Vec> { - pub fn from_matching_words(matching_words: MatchingWords) -> Self { - Self::new(matching_words, TokenizerBuilder::default().build()) - } - } - - #[test] - fn format_identity() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: None }; - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop and no highlight should return complete text. - assert_eq!(&matcher.format(format_options), &text); - } - - #[test] - fn format_highlight() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ""); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - assert_eq!(&matcher.format(format_options), ":-)"); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no crop should return complete text, because there is no matches. - assert_eq!(&matcher.format(format_options), &text); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Natalie risk her future to build a world with the boy she loves." - ); - } - - #[test] - fn highlight_unicode() { - let all = vec![ - Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()), - Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()), - ]; - let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: None }; - - // Text containing prefix match. - let text = "Ŵôřlḑôle"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑôle" - ); - - // Text containing unicode match. - let text = "Ŵôřlḑ"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Ŵôřlḑ" - ); - - // Text containing unicode match. - let text = "Westfália"; - let mut matcher = builder.build(text); - // no crop should return complete text with highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"Westfália" - ); - } - - #[test] - fn format_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: false, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text without any match starting by a separator. - let text = "(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)"; - let mut matcher = builder.build(text); - // no highlight should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"(A quick brown fox can not jump 32 feet, right…" - ); - - // Test phrase propagation - let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; - let mut matcher = builder.build(text); - // should crop the phrase instead of croping around the match. - insta::assert_snapshot!( - matcher.format(format_options), - @"… Split The World is a book written by Emily Henry…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // no highlight should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with diferent density. - let text = "split void the void void world void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - - // Text containing matches with same word. - let text = "split split split split split split void void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn format_highlight_crop() { - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let format_options = FormatOptions { highlight: true, crop: Some(10) }; - - // empty text. - let text = ""; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"" - ); - - // text containing only separators. - let text = ":-)"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @":-)" - ); - - // Text without any match. - let text = "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"; - let mut matcher = builder.build(text); - // both should return 10 first words with a marker at the end. - insta::assert_snapshot!( - matcher.format(format_options), - @"A quick brown fox can not jump 32 feet, right…" - ); - - // Text containing some matches. - let text = "Natalie risk her future to build a world with the boy she loves."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…future to build a world with the boy she loves…" - ); - - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; - let mut matcher = builder.build(text); - // both should return 10 last words with a marker at the start and highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…she loves. Emily Henry: The Love That Split The World." - ); - - // Text containing a match unordered and a match ordered. - let text = "The world split void void void void void void void void void split the world void void"; - let mut matcher = builder.build(text); - // crop should return 10 last words with a marker at the start. - insta::assert_snapshot!( - matcher.format(format_options), - @"…void void void void void split the world void void" - ); - } - - #[test] - fn smaller_crop_size() { - //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let matching_words = matching_words(); - - let builder = MatcherBuilder::from_matching_words(matching_words); - - let text = "void void split the world void void."; - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(2) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split the…" - ); - - // set a smaller crop size - let format_options = FormatOptions { highlight: false, crop: Some(1) }; - let mut matcher = builder.build(text); - // because crop size < query size, partially format matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…split…" - ); - - // set crop size to 0 - let format_options = FormatOptions { highlight: false, crop: Some(0) }; - let mut matcher = builder.build(text); - // because crop size is 0, crop is ignored. - insta::assert_snapshot!( - matcher.format(format_options), - @"void void split the world void void." - ); - } - - #[test] - fn partial_matches() { - let all = vec![ - Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()), - Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()), - ]; - let matching_words = vec![ - (vec![all[0].clone()], vec![0]), - (vec![all[1].clone(), all[2].clone()], vec![0]), - (vec![all[3].clone()], vec![1]), - (vec![all[4].clone(), all[5].clone()], vec![1]), - (vec![all[4].clone()], vec![2]), - ]; - - let matching_words = MatchingWords::new(matching_words).unwrap(); - - let mut builder = MatcherBuilder::from_matching_words(matching_words); - builder.highlight_prefix("_".to_string()); - builder.highlight_suffix("_".to_string()); - - let format_options = FormatOptions { highlight: true, crop: None }; - - let text = "the do or die can't be he do and or isn't he"; - let mut matcher = builder.build(text); - insta::assert_snapshot!( - matcher.format(format_options), - @"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_" - ); - } -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 3683a5cf0..3e372e551 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -17,7 +17,6 @@ static LEVDIST2: Lazy = Lazy::new(|| LevBuilder::new(2, true)); pub mod facet; mod fst_utils; -mod matches; pub mod new; pub struct Search<'a> {