diff --git a/crates/milli/src/search/new/matches/best_match_interval.rs b/crates/milli/src/search/new/matches/best_match_interval.rs
new file mode 100644
index 000000000..a6497f351
--- /dev/null
+++ b/crates/milli/src/search/new/matches/best_match_interval.rs
@@ -0,0 +1,139 @@
+use super::matching_words::WordId;
+use super::{Match, MatchPosition};
+
+struct MatchIntervalWithScore {
+    interval: [usize; 2],
+    score: [i16; 3],
+}
+
+// count score for phrases
+fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
+    let words_in_phrase_minus_one = (lwp - fwp) as i16;
+    // will always be ordered, so +1 for each space between words
+    *order_score += words_in_phrase_minus_one;
+    // distance will always be 1, so -1 for each space between words
+    *distance_score -= words_in_phrase_minus_one;
+}
+
+/// Compute the score of a match interval:
+/// 1) count unique matches
+/// 2) calculate distance between matches
+/// 3) count ordered matches
+fn get_interval_score(matches: &[Match]) -> [i16; 3] {
+    let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
+    let mut order_score = 0;
+    let mut distance_score = 0;
+
+    let mut iter = matches.iter().peekable();
+    while let Some(m) = iter.next() {
+        if let Some(next_match) = iter.peek() {
+            // if matches are ordered
+            if next_match.ids.iter().min() > m.ids.iter().min() {
+                order_score += 1;
+            }
+
+            let m_last_word_pos = match m.position {
+                MatchPosition::Word { word_position, .. } => word_position,
+                MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
+                    tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+                    lwp
+                }
+            };
+            let next_match_first_word_pos = next_match.get_first_word_pos();
+
+            // compute distance between matches
+            distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
+        } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
+            // in case the last match is a phrase, count the score for its words
+            tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+        }
+
+        ids.extend(m.ids.iter());
+    }
+
+    ids.sort_unstable();
+    ids.dedup();
+    let uniq_score = ids.len() as i16;
+
+    // rank by unique match count, then by distance between matches, then by ordered match count.
+    [uniq_score, distance_score, order_score]
+}
+
+/// Returns the first and last match where the score computed by get_interval_score is the best.
+pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
+    if matches.is_empty() {
+        panic!("`matches` should not be empty at this point");
+    }
+
+    // positions of the first and the last match of the best matches interval in `matches`.
+    let mut best_interval: Option<MatchIntervalWithScore> = None;
+
+    let mut save_best_interval = |interval_first, interval_last| {
+        let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
+        let is_interval_score_better = &best_interval
+            .as_ref()
+            .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
+
+        if *is_interval_score_better {
+            best_interval = Some(MatchIntervalWithScore {
+                interval: [interval_first, interval_last],
+                score: interval_score,
+            });
+        }
+    };
+
+    // we compute the matches interval if we have at least 2 matches.
+    // current interval positions.
+    let mut interval_first = 0;
+    let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+    for (index, next_match) in matches.iter().enumerate() {
+        // if the next match would make the interval grow to more than crop_size,
+        // we compare the current interval with the best one,
+        // then we increase `interval_first` until the next match can be added.
+        let next_match_last_word_pos = next_match.get_last_word_pos();
+
+        // if the next match would mean that we pass the crop size window,
+        // we take the last valid match that didn't pass this boundary, which is `index` - 1,
+        // and calculate a score for it, and check if it's better than our best so far
+        if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+            // if index is 0 there is no last viable match
+            if index != 0 {
+                let interval_last = index - 1;
+                // keep interval if it's the best
+                save_best_interval(interval_first, interval_last);
+            }
+
+            // advance start of the interval while interval is longer than crop_size.
+            loop {
+                interval_first += 1;
+                if interval_first == matches.len() {
+                    interval_first -= 1;
+                    break;
+                }
+
+                interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+                if interval_first_match_first_word_pos > next_match_last_word_pos
+                    || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    // compute the last interval score and compare it to the best one.
+    let interval_last = matches.len() - 1;
+    // if the interval is a single match, we need to make sure it's
+    // not a phrase longer than the crop window
+    if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
+        save_best_interval(interval_first, interval_last);
+    }
+
+    // if none of the matches fit the criteria above, default to the first one
+    best_interval.map_or(
+        [&matches[0], &matches[0]],
+        |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
+    )
+}
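Note: the `[i16; 3]` score works because Rust compares arrays lexicographically, so unique-match count always dominates, distance only breaks ties, and order comes last. A standalone sketch of that comparison (values are illustrative, not taken from the patch):

```rust
fn main() {
    // [uniq_score, distance_score, order_score], compared lexicographically.
    let two_close_matches: [i16; 3] = [2, -1, 1];
    let two_far_matches: [i16; 3] = [2, -7, 1];
    let three_far_matches: [i16; 3] = [3, -14, 0];

    // same unique count: the smaller distance penalty wins
    assert!(two_close_matches > two_far_matches);
    // a higher unique-match count beats any distance or order score
    assert!(three_far_matches > two_close_matches);
}
```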
diff --git a/crates/milli/src/search/new/matches/match.rs b/crates/milli/src/search/new/matches/match.rs
new file mode 100644
index 000000000..2eef4d5a6
--- /dev/null
+++ b/crates/milli/src/search/new/matches/match.rs
@@ -0,0 +1,62 @@
+use super::matching_words::WordId;
+
+#[derive(Clone, Debug)]
+pub enum MatchPosition {
+    Word {
+        // position of the word in the whole text.
+        word_position: usize,
+        // position of the token in the whole text.
+        token_position: usize,
+    },
+    Phrase {
+        // position of the first and last word in the phrase in the whole text.
+        word_positions: [usize; 2],
+        // position of the first and last token in the phrase in the whole text.
+        token_positions: [usize; 2],
+    },
+}
+
+#[derive(Clone, Debug)]
+pub struct Match {
+    pub char_count: usize,
+    // ids of the query words that match.
+    pub ids: Vec<WordId>,
+    pub position: MatchPosition,
+}
+
+impl Match {
+    pub(super) fn get_first_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
+        }
+    }
+
+    pub(super) fn get_last_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
+        }
+    }
+
+    pub(super) fn get_first_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
+        }
+    }
+
+    pub(super) fn get_last_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
+        }
+    }
+
+    pub(super) fn get_word_count(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { .. } => 1,
+            MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
+        }
+    }
+}
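For intuition on the inclusive bounds above: a phrase spanning word positions 3 through 5 counts as three words, while a single-word match is always one. A trimmed-down standalone sketch (toy types, not the module's own):

```rust
enum MatchPosition {
    Word { word_position: usize },
    Phrase { word_positions: [usize; 2] },
}

fn word_count(position: &MatchPosition) -> usize {
    match position {
        MatchPosition::Word { .. } => 1,
        // inclusive bounds: positions [3, 5] cover words 3, 4 and 5
        MatchPosition::Phrase { word_positions: [fwp, lwp] } => lwp - fwp + 1,
    }
}

fn main() {
    assert_eq!(word_count(&MatchPosition::Word { word_position: 7 }), 1);
    assert_eq!(word_count(&MatchPosition::Phrase { word_positions: [3, 5] }), 3);
}
```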
diff --git a/crates/milli/src/search/new/matches/matching_words.rs b/crates/milli/src/search/new/matches/matching_words.rs
index 4ad5c37ec..1f30a17ad 100644
--- a/crates/milli/src/search/new/matches/matching_words.rs
+++ b/crates/milli/src/search/new/matches/matching_words.rs
@@ -86,14 +86,17 @@ impl MatchingWords {
                     continue;
                 };
                 let prefix_length = char_index + c.len_utf8();
-                let char_len = token.original_lengths(prefix_length).0;
+                let (char_count, byte_len) = token.original_lengths(prefix_length);
                 let ids = &located_words.positions;
-                return Some(MatchType::Full { char_len, ids });
+                return Some(MatchType::Full { ids, char_count, byte_len });
             // else we exact match the token.
             } else if token.lemma() == word {
-                let char_len = token.char_end - token.char_start;
                 let ids = &located_words.positions;
-                return Some(MatchType::Full { char_len, ids });
+                return Some(MatchType::Full {
+                    char_count: token.char_end - token.char_start,
+                    byte_len: token.byte_end - token.byte_start,
+                    ids,
+                });
             }
         }
     }
@@ -130,7 +133,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
                 word.map(|word| self.matching_words.word_interner.get(word).as_str())
             })
            .collect();
-        let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
+        let partial = PartialMatch { matching_words: words, ids };

        partial.match_token(self.token).or_else(|| self.next())
    }
@@ -149,7 +152,7 @@ pub type WordId = u16;
 /// In these cases we need to match consecutively several tokens to consider that the match is full.
 #[derive(Debug, PartialEq)]
 pub enum MatchType<'a> {
-    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
+    Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
     Partial(PartialMatch<'a>),
 }

@@ -158,7 +161,6 @@ pub enum MatchType<'a> {
 pub struct PartialMatch<'a> {
     matching_words: Vec<Option<&'a str>>,
     ids: &'a RangeInclusive<WordId>,
-    char_len: usize,
 }

@@ -176,25 +178,24 @@ impl<'a> PartialMatch<'a> {
             None => token.is_stopword(),
         };

-        let char_len = token.char_end - token.char_start;
-
         // if there are remaining words to match in the phrase and the current token is matching,
         // return a new Partial match allowing the highlighter to continue.
         if is_matching && matching_words.len() > 1 {
             matching_words.remove(0);

-            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids }))
        // if there is no remaining word to match in the phrase and the current token is matching,
        // return a Full match.
         } else if is_matching {
-            Some(MatchType::Full { char_len, ids })
+            Some(MatchType::Full {
+                char_count: token.char_end - token.char_start,
+                byte_len: token.byte_end - token.byte_start,
+                ids,
+            })
        // if the current token doesn't match, return None to break the match sequence.
         } else {
             None
         }
     }
-
-    pub fn char_len(&self) -> usize {
-        self.char_len
-    }
 }

 impl fmt::Debug for MatchingWords {
@@ -276,7 +277,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
         );
         assert_eq!(
             matching_words
@@ -300,7 +301,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
@@ -312,7 +313,7 @@ pub(crate) mod tests {
                    ..Default::default()
                })
                .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
        );
        assert_eq!(
            matching_words
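The reason `MatchType::Full` now carries both `char_count` and `byte_len` is that the two diverge on any non-ASCII text, and mapping a character count onto a byte index is exactly what the highlighter has to do later. A quick standalone illustration (not using the types above):

```rust
fn main() {
    let word = "héllo"; // 'é' is one char but two bytes in UTF-8
    assert_eq!(word.chars().count(), 5);
    assert_eq!(word.len(), 6); // str::len() is in bytes

    // mapping a char count back to a byte index, as the highlighter must:
    let byte_index = word.char_indices().nth(5).map_or(word.len(), |(i, _)| i);
    assert_eq!(byte_index, 6);
}
```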
diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs
index 4688b8f32..80e3ec7b2 100644
--- a/crates/milli/src/search/new/matches/mod.rs
+++ b/crates/milli/src/search/new/matches/mod.rs
@@ -1,11 +1,19 @@
-use std::borrow::Cow;
+mod best_match_interval;
+mod r#match;
+mod matching_words;
+mod simple_token_kind;

 use charabia::{Language, SeparatorKind, Token, Tokenizer};
+use either::Either;
 pub use matching_words::MatchingWords;
-use matching_words::{MatchType, PartialMatch, WordId};
+use matching_words::{MatchType, PartialMatch};
+use r#match::{Match, MatchPosition};
 use serde::Serialize;
-
-pub mod matching_words;
+use simple_token_kind::SimpleTokenKind;
+use std::{
+    borrow::Cow,
+    cmp::{max, min},
+};

 const DEFAULT_CROP_MARKER: &str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@@ -93,17 +101,6 @@ impl FormatOptions {
     }
 }

-#[derive(Clone, Debug)]
-pub struct Match {
-    match_len: usize,
-    // ids of the query words that matches.
-    ids: Vec<WordId>,
-    // position of the word in the whole text.
-    word_position: usize,
-    // position of the token in the whole text.
-    token_position: usize,
-}
-
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
 pub struct MatchBounds {
     pub start: usize,
@@ -130,41 +127,27 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     /// compute_partial_match peek into next words to validate if the match is complete.
     fn compute_partial_match<'a>(
         mut partial: PartialMatch<'a>,
-        token_position: usize,
-        word_position: usize,
+        first_token_position: usize,
+        first_word_position: usize,
+        first_word_char_start: &usize,
         words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
         matches: &mut Vec<Match>,
     ) -> bool {
-        let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
-
         for (token_position, word_position, word) in words_positions {
             partial = match partial.match_token(word) {
                 // token matches the partial match, but the match is not full,
                 // we temporarily save the current token then we try to match the next one.
-                Some(MatchType::Partial(partial)) => {
-                    potential_matches.push((token_position, word_position, partial.char_len()));
-                    partial
-                }
+                Some(MatchType::Partial(partial)) => partial,
                 // partial match is now full, we keep this matches and we advance positions
-                Some(MatchType::Full { char_len, ids }) => {
-                    let ids: Vec<_> = ids.clone().collect();
-                    // save previously matched tokens as matches.
-                    let iter = potential_matches.into_iter().map(
-                        |(token_position, word_position, match_len)| Match {
-                            match_len,
-                            ids: ids.clone(),
-                            word_position,
-                            token_position,
-                        },
-                    );
-                    matches.extend(iter);
-
+                Some(MatchType::Full { ids, .. }) => {
                     // save the token that closes the partial match as a match.
                     matches.push(Match {
-                        match_len: char_len,
-                        ids,
-                        word_position,
-                        token_position,
+                        char_count: word.char_end - *first_word_char_start,
+                        ids: ids.clone().collect(),
+                        position: MatchPosition::Phrase {
+                            word_positions: [first_word_position, word_position],
+                            token_positions: [first_token_position, token_position],
+                        },
                     });

                     // the match is complete, we return true.
@@ -202,13 +185,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 match match_type {
                     // we match, we save the current token as a match,
                     // then we continue the rest of the tokens.
-                    MatchType::Full { char_len, ids } => {
+                    MatchType::Full { ids, char_count, .. } => {
                         let ids: Vec<_> = ids.clone().collect();
                         matches.push(Match {
-                            match_len: char_len,
+                            char_count,
                             ids,
-                            word_position,
-                            token_position,
+                            position: MatchPosition::Word { word_position, token_position },
                        });
                        break;
                    }
@@ -221,6 +203,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                             partial,
                             token_position,
                             word_position,
+                            &word.char_start,
                             &mut wp,
                             &mut matches,
                         ) {
@@ -243,56 +226,99 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[m.token_position].byte_start,
-                    length: m.match_len,
+                    start: tokens[m.get_first_token_pos()].byte_start,
+                    // TODO: Why is this in chars, while start is in bytes?
+                    length: m.char_count,
                 })
                 .collect(),
         }
     }

     /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(
-        &self,
-        tokens: &[Token<'_>],
-        matches: &[Match],
-        crop_size: usize,
-    ) -> (usize, usize) {
-        // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
-        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
+    fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
+        let (
+            mut remaining_words,
+            is_iterating_forward,
+            before_tokens_starting_index,
+            after_tokens_starting_index,
+        ) = if !matches.is_empty() {
+            let [matches_first, matches_last] =
+                best_match_interval::find_best_match_interval(matches, crop_size);

-        // matches needs to be counted in the crop len.
-        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+            let matches_size =
+                matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
+
+            let is_crop_size_gte_match_size = crop_size >= matches_size;
+            let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
+
+            let remaining_words = if is_crop_size_gte_match_size {
+                crop_size - matches_size
+            } else {
+                // in case the matches size is greater than the crop size, which implies there's
+                // only one match, we count words backwards, because we have to remove the extra
+                // words outside of the crop window
+                matches_size - crop_size
+            };
+
+            let after_tokens_starting_index = if matches_size == 0 {
+                0
+            } else {
+                let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
+                if last_match_last_token_position_plus_one < tokens.len() {
+                    last_match_last_token_position_plus_one
+                } else {
+                    // we have matched the end of possible tokens, there's nothing to advance
+                    tokens.len() - 1
+                }
+            };
+
+            (
+                remaining_words,
+                is_iterating_forward,
+                if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
+                after_tokens_starting_index,
+            )
+        } else {
+            (crop_size, true, 0, 0)
+        };

         // create the initial state of the crop window: 2 iterators starting from the matches positions,
         // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
-        // an iterator starting from the last match token position and going towards the end of the text.
-        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
+        let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
+        // an iterator ...
+        let mut after_tokens = if is_iterating_forward {
+            // ... starting from the last match token position and going towards the end of the text.
+            Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
+        } else {
+            // ... starting from the last match token position and going towards the start of the text.
+            Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
+        };

         // grows the crop window peeking in both directions
         // until the window contains the good number of words:
         while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.separator_kind());
-            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+            let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
+            let after_token_kind =
+                after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);

-            match (before_token, after_token) {
+            match (before_token_kind, after_token_kind) {
                 // we can expand both sides.
-                (Some(before_token), Some(after_token)) => {
-                    match (before_token, after_token) {
+                (Some(before_token_kind), Some(after_token_kind)) => {
+                    match (before_token_kind, after_token_kind) {
                         // if they are both separators and are the same kind then advance both,
                         // or expand in the soft separator separator side.
-                        (Some(before_token_kind), Some(after_token_kind)) => {
-                            if before_token_kind == after_token_kind {
+                        (
+                            SimpleTokenKind::Separator(before_token_separator_kind),
+                            SimpleTokenKind::Separator(after_token_separator_kind),
+                        ) => {
+                            if before_token_separator_kind == after_token_separator_kind {
                                 before_tokens.next();

                                 // this avoid having an ending separator before crop marker.
                                 if remaining_words > 1 {
                                     after_tokens.next();
                                 }
-                            } else if before_token_kind == SeparatorKind::Hard {
+                            } else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
                                 after_tokens.next();
                             } else {
                                 before_tokens.next();
@@ -300,17 +326,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                         }
                         // if one of the tokens is a word, we expend in the side of the word.
                         // left is a word, advance left.
-                        (None, Some(_)) => {
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
                             before_tokens.next();
                             remaining_words -= 1;
                         }
                         // right is a word, advance right.
-                        (Some(_), None) => {
+                        (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                             after_tokens.next();
                             remaining_words -= 1;
                         }
                         // both are words, advance left then right if remaining_word > 0.
-                        (None, None) => {
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
                             before_tokens.next();
                             remaining_words -= 1;
@@ -322,16 +348,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 }
             }
             // the end of the text is reached, advance left.
-            (Some(before_token), None) => {
+            (Some(before_token_kind), None) => {
                 before_tokens.next();
-                if before_token.is_none() {
+                if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
                     remaining_words -= 1;
                 }
             }
             // the start of the text is reached, advance right.
-            (None, Some(after_token)) => {
+            (None, Some(after_token_kind)) => {
                 after_tokens.next();
-                if after_token.is_none() {
+                if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
                     remaining_words -= 1;
                 }
             }
@@ -344,86 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
         let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);

-        (crop_byte_start, crop_byte_end)
-    }
-
-    /// Compute the score of a match interval:
-    /// 1) count unique matches
-    /// 2) calculate distance between matches
-    /// 3) count ordered matches
-    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
-        let mut order_score = 0;
-        let mut distance_score = 0;
-
-        let mut iter = matches.iter().peekable();
-        while let Some(m) = iter.next() {
-            if let Some(next_match) = iter.peek() {
-                // if matches are ordered
-                if next_match.ids.iter().min() > m.ids.iter().min() {
-                    order_score += 1;
-                }
-
-                // compute distance between matches
-                distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
-            }
-
-            ids.extend(m.ids.iter());
-        }
-
-        ids.sort_unstable();
-        ids.dedup();
-        let uniq_score = ids.len() as i16;
-
-        // rank by unique match count, then by distance between matches, then by ordered match count.
-        (uniq_score, distance_score, order_score)
-    }
-
-    /// Returns the matches interval where the score computed by match_interval_score is the best.
-    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
-        // we compute the matches interval if we have at least 2 matches.
-        if matches.len() > 1 {
-            // positions of the first and the last match of the best matches interval in `matches`.
-            let mut best_interval = (0, 0);
-            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
-            // current interval positions.
-            let mut interval_first = 0;
-            let mut interval_last = 0;
-            for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size,
-                // we compare the current interval with the best one,
-                // then we increase `interval_first` until next match can be added.
-                if next_match.word_position - matches[interval_first].word_position >= crop_size {
-                    let interval_score =
-                        self.match_interval_score(&matches[interval_first..=interval_last]);
-
-                    // keep interval if it's the best
-                    if interval_score > best_interval_score {
-                        best_interval = (interval_first, interval_last);
-                        best_interval_score = interval_score;
-                    }
-
-                    // advance start of the interval while interval is longer than crop_size.
-                    while next_match.word_position - matches[interval_first].word_position
-                        >= crop_size
-                    {
-                        interval_first += 1;
-                    }
-                }
-                interval_last = index;
-            }
-
-            // compute the last interval score and compare it to the best one.
-            let interval_score =
-                self.match_interval_score(&matches[interval_first..=interval_last]);
-            if interval_score > best_interval_score {
-                best_interval = (interval_first, interval_last);
-            }
-
-            &matches[best_interval.0..=best_interval.1]
-        } else {
-            matches
-        }
+        [crop_byte_start, crop_byte_end]
     }

     // Returns the formatted version of the original text.
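The `Either` wrapper above lets one variable hold two differently-typed `Peekable` iterators (forward vs. reverse). A minimal sketch of the pattern, assuming the `either` crate as in the patch's `use either::Either;` (toy data, not the matcher's tokens):

```rust
use either::Either;

fn main() {
    let tokens = ["a", "b", "c", "d"];
    let is_iterating_forward = false;

    // both branches are Peekable, but over different iterator types,
    // so Either unifies them into one variable.
    let mut iter = if is_iterating_forward {
        Either::Left(tokens.iter().peekable())
    } else {
        Either::Right(tokens.iter().rev().peekable())
    };

    // Either implements Iterator when both sides do...
    assert_eq!(iter.next(), Some(&"d"));
    // ...but peeking has to reach the concrete Peekable on each side,
    // which is why the patch uses `as_mut().either(...)`:
    let peeked = iter.as_mut().either(|v| v.peek().copied(), |v| v.peek().copied());
    assert_eq!(peeked, Some(&"c"));
}
```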
@@ -434,69 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // crop around the best interval.
-                    let (byte_start, byte_end) = match format_options.crop {
+                    // If the text has to be cropped, crop around the best interval.
+                    let [crop_byte_start, crop_byte_end] = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
-                            let matches = self.find_best_match_interval(matches, crop_size);
                             self.crop_bounds(tokens, matches, crop_size)
                         }
-                        _ => (0, self.text.len()),
+                        _ => [0, self.text.len()],
                     };

                     let mut formatted = Vec::new();

                     // push crop marker if it's not the start of the text.
-                    if byte_start > 0 && !self.crop_marker.is_empty() {
+                    if crop_byte_start > 0 && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }

-                    let mut byte_index = byte_start;
+                    let mut byte_index = crop_byte_start;

                     if format_options.highlight {
                         // insert highlight markers around matches.
                         for m in matches {
-                            let token = &tokens[m.token_position];
+                            let [m_byte_start, m_byte_end] = match m.position {
+                                MatchPosition::Word { token_position, .. } => {
+                                    let token = &tokens[token_position];
+                                    [&token.byte_start, &token.byte_end]
+                                }
+                                MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
+                                    [&tokens[ftp].byte_start, &tokens[ltp].byte_end]
+                                }
+                            };

-                            // skip matches out of the crop window.
-                            if token.byte_start < byte_start || token.byte_end > byte_end {
+                            // skip matches out of the crop window
+                            if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
                                 continue;
                             }

-                            if byte_index < token.byte_start {
-                                formatted.push(&self.text[byte_index..token.byte_start]);
+                            // adjust start and end to the crop window size
+                            let [m_byte_start, m_byte_end] = [
+                                max(m_byte_start, &crop_byte_start),
+                                min(m_byte_end, &crop_byte_end),
+                            ];
+
+                            // push text that is positioned before our matches
+                            if byte_index < *m_byte_start {
+                                formatted.push(&self.text[byte_index..*m_byte_start]);
                             }

-                            let highlight_byte_index = self.text[token.byte_start..]
-                                .char_indices()
-                                .enumerate()
-                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                             formatted.push(self.highlight_prefix);
-                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
+
+                            // TODO: This is additional work done, charabia::token::Token byte_len
+                            // should already get us the original byte length, however, that doesn't work as
+                            // it's supposed to, investigate why
+                            let highlight_byte_index = self.text[*m_byte_start..]
+                                .char_indices()
+                                .nth(m.char_count)
+                                .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
+                            formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
+
                             formatted.push(self.highlight_suffix);
+
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                            if highlight_byte_index < token.byte_end {
-                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            if highlight_byte_index < *m_byte_end {
+                                formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
                             }

-                            byte_index = token.byte_end;
+                            byte_index = *m_byte_end;
                         }
                     }

                     // push the rest of the text between last match and the end of crop.
-                    if byte_index < byte_end {
-                        formatted.push(&self.text[byte_index..byte_end]);
+                    if byte_index < crop_byte_end {
+                        formatted.push(&self.text[byte_index..crop_byte_end]);
                     }

                     // push crop marker if it's not the end of the text.
-                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
+                    if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }

                     if formatted.len() == 1 {
                         // avoid concatenating if there is already 1 slice.
-                        Cow::Borrowed(&self.text[byte_start..byte_end])
+                        Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
                     } else {
                         Cow::Owned(formatted.concat())
                     }
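Clamping each match's byte range to the crop window with `max`/`min`, as the hunk above does, is what keeps a phrase straddling the window edge from highlighting text outside the crop. A small standalone sketch of that clamping (byte offsets are illustrative):

```rust
use std::cmp::{max, min};

fn main() {
    let (crop_byte_start, crop_byte_end) = (10usize, 50usize);

    // a phrase match that starts before the crop window and ends inside it
    let (m_byte_start, m_byte_end) = (4usize, 23usize);

    // skip matches entirely out of the crop window
    if m_byte_end < crop_byte_start || m_byte_start > crop_byte_end {
        return;
    }

    // adjust start and end to the crop window
    let clamped = (max(m_byte_start, crop_byte_start), min(m_byte_end, crop_byte_end));
    assert_eq!(clamped, (10, 23)); // only the part inside the window is highlighted
}
```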
@@ -821,22 +786,24 @@ mod tests {
     fn format_highlight_crop_phrase_query() {
         //! testing: https://github.com/meilisearch/meilisearch/issues/3975
         let temp_index = TempIndex::new();
+
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
         temp_index
             .add_documents(documents!([
-                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+                { "id": 1, "text": text }
             ]))
             .unwrap();
+
         let rtxn = temp_index.read_txn().unwrap();

         let format_options = FormatOptions { highlight: true, crop: Some(10) };
-        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";

         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
         let mut matcher = builder.build(text, None);
         // should return 10 words with a marker at the start as well the end, and the highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+            @"…the power to split <em>the world</em> between those who embraced…"
         );

         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
@@ -844,7 +811,63 @@ mod tests {
         // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            // TODO: Should include exclamation mark without crop markers
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"split the world between\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…<em>invention</em> <em>had the power to split the world between those</em>…"
         );
     }
@@ -900,7 +923,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
+            @"_the_ _do or_ die can't be he do and or isn'_t he_"
         );
     }
 }
diff --git a/crates/milli/src/search/new/matches/simple_token_kind.rs b/crates/milli/src/search/new/matches/simple_token_kind.rs
new file mode 100644
index 000000000..b34a8c985
--- /dev/null
+++ b/crates/milli/src/search/new/matches/simple_token_kind.rs
@@ -0,0 +1,15 @@
+use charabia::{SeparatorKind, Token, TokenKind};
+
+pub enum SimpleTokenKind {
+    Separator(SeparatorKind),
+    NotSeparator,
+}
+
+impl SimpleTokenKind {
+    pub fn new(token: &&Token<'_>) -> Self {
+        match token.kind {
+            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
+            _ => Self::NotSeparator,
+        }
+    }
+}
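A closing note on the `&&Token` parameter above: it matches what `Peekable::peek` yields on an iterator of `&Token`, so `peek().map(SimpleTokenKind::new)` works without an extra closure. A trimmed-down standalone sketch of the same classification pattern (toy `Token`/`TokenKind` types, not charabia's):

```rust
#[derive(Clone, Copy, PartialEq, Debug)]
enum SeparatorKind { Hard, Soft }

enum TokenKind { Word, Separator(SeparatorKind) }

struct Token { kind: TokenKind }

enum SimpleTokenKind {
    Separator(SeparatorKind),
    NotSeparator,
}

impl SimpleTokenKind {
    // `&&Token` because peek() on an iterator of `&Token` yields `&&Token`
    fn new(token: &&Token) -> Self {
        match token.kind {
            TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
            _ => Self::NotSeparator,
        }
    }
}

fn main() {
    let tokens = vec![
        Token { kind: TokenKind::Word },
        Token { kind: TokenKind::Separator(SeparatorKind::Soft) },
    ];
    let mut iter = tokens.iter().peekable();
    let kind = iter.peek().map(SimpleTokenKind::new);
    assert!(matches!(kind, Some(SimpleTokenKind::NotSeparator)));
}
```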