From 3be179080321308baa4c4e82f741fefcb61e5869 Mon Sep 17 00:00:00 2001
From: ManyTheFish
Date: Mon, 28 Mar 2022 15:57:05 +0200
Subject: [PATCH] Add crop algorithm with naive match algorithm

---
 milli/src/search/matches/mod.rs | 199 +++++++++++++++++++++++---------
 1 file changed, 144 insertions(+), 55 deletions(-)

diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index 0ebf6305f..fb3ab9c37 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -92,13 +92,15 @@ impl MatcherBuilder {
 //     }
 // }
 
-pub struct Match<'t> {
-    token: &'t Token<'t>,
+#[derive(Clone)]
+pub struct Match {
     match_len: usize,
     // id of the query word that matches.
     id: usize,
     // position of the word in the whole text.
-    position: usize,
+    word_position: usize,
+    // position of the token in the whole text.
+    token_position: usize,
 }
 
 pub struct MatchBounds {
@@ -106,12 +108,6 @@
     start: usize,
     length: usize,
 }
 
-impl<'t> From<&Match<'t>> for MatchBounds {
-    fn from(m: &Match) -> Self {
-        MatchBounds { start: m.token.byte_start, length: m.match_len }
-    }
-}
-
 pub struct Matcher<'t, 'm> {
     text: &'t str,
     tokens: &'t [Token<'t>],
@@ -120,26 +116,22 @@
     crop_marker: &'m str,
     highlight_prefix: &'m str,
     highlight_suffix: &'m str,
-    matches: Option<Vec<Match<'t>>>,
+    matches: Option<Vec<Match>>,
 }
 
 impl<'t> Matcher<'t, '_> {
     fn compute_matches(&mut self) -> &mut Self {
         let mut matches = Vec::new();
-        let mut position = 0;
+        let mut word_position = 0;
+        let mut token_position = 0;
         for token in self.tokens {
-            match token.is_separator() {
-                Some(SeparatorKind::Hard) => position += 7,
-                None => {
-                    if let Some((match_len, id)) =
-                        self.matching_words.matching_bytes_with_id(&token)
-                    {
-                        matches.push(Match { token, match_len, id, position });
-                    }
-                    position += 1;
+            if token.is_separator().is_none() {
+                if let Some((match_len, id)) = self.matching_words.matching_bytes_with_id(&token) {
+                    matches.push(Match { match_len, id, word_position, token_position });
                 }
-                _otherwise => {}
+                word_position += 1;
             }
+            token_position += 1;
         }
 
         self.matches = Some(matches);
@@ -149,21 +141,104 @@
     pub fn matches(&mut self) -> Vec<MatchBounds> {
         match &self.matches {
             None => self.compute_matches().matches(),
-            Some(matches) => matches.iter().map(MatchBounds::from).collect(),
+            Some(matches) => matches
+                .iter()
+                .map(|m| MatchBounds {
+                    start: self.tokens[m.token_position].byte_start,
+                    length: m.match_len,
+                })
+                .collect(),
         }
     }
 
-    fn crop_bounds(&self, matches: &[Match<'t>]) -> (usize, usize) {
-        let byte_end = self
-            .tokens
-            .iter()
-            .filter(|t| t.is_separator().is_none())
-            .enumerate()
-            .take_while(|(i, _)| *i < self.crop_size)
-            .last()
-            .map_or(self.text.len(), |(_, t)| t.byte_end);
+    fn crop_around(&self, matches: &[Match]) -> (usize, usize) {
+        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
+        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
+        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
+        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
 
-        (0, byte_end)
+        // TODO: buggy if no match and first token is a separator
+        let mut remaining_words =
+            self.crop_size + first_match_word_position - last_match_word_position - 1;
+        let mut first_token_position = first_match_token_position;
+        let mut last_token_position = last_match_token_position;
+
+        while remaining_words > 0 {
+            match (
+                first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
+                last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
+            ) {
+                (Some(ft), Some(lt)) => {
+                    match (ft.is_separator(), lt.is_separator()) {
+                        // if they are both separators and are the same kind then advance both
+                        (Some(f_kind), Some(s_kind)) => {
+                            if f_kind == s_kind {
+                                first_token_position -= 1;
+                                last_token_position += 1;
+                            } else if f_kind == SeparatorKind::Hard {
+                                last_token_position += 1;
+                            } else {
+                                first_token_position -= 1;
+                            }
+                        }
+                        // left is a word, advance left
+                        (None, Some(_)) => {
+                            first_token_position -= 1;
+                            remaining_words -= 1;
+                        }
+                        // right is a word, advance right
+                        (Some(_), None) => {
+                            last_token_position += 1;
+                            remaining_words -= 1;
+                        }
+                        // both are words, advance left then right if remaining_words > 0
+                        (None, None) => {
+                            first_token_position -= 1;
+                            remaining_words -= 1;
+
+                            if remaining_words > 0 {
+                                last_token_position += 1;
+                                remaining_words -= 1;
+                            }
+                        }
+                    }
+                }
+                (Some(ft), None) => {
+                    first_token_position -= 1;
+                    if ft.is_separator().is_none() {
+                        remaining_words -= 1;
+                    }
+                }
+                (None, Some(lt)) => {
+                    last_token_position += 1;
+                    if lt.is_separator().is_none() {
+                        remaining_words -= 1;
+                    }
+                }
+                (None, None) => break,
+            }
+        }
+
+        // if tokens after the end of the window are separators,
+        // then add them to the window in order to keep context in cropped text.
+        while let Some(_separator_kind) = last_token_position
+            .checked_add(1)
+            .and_then(|i| self.tokens.get(i))
+            .and_then(|t| t.is_separator())
+        {
+            last_token_position += 1;
+        }
+
+        (self.tokens[first_token_position].byte_start, self.tokens[last_token_position].byte_end)
+    }
+
+    fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
+        match matches {
+            // at least 2 matches
+            [first, last, ..] => self.crop_around(&[first.clone()][..]),
+            // less than 2 matches
+            _ => self.crop_around(matches),
+        }
     }
 
     pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
@@ -187,20 +262,23 @@
         if highlight {
             // insert highlight markers around matches.
+            let tokens = self.tokens;
             for m in matches
                 .iter()
-                .skip_while(|m| m.token.byte_start < byte_start)
-                .take_while(|m| m.token.byte_start < byte_end)
+                .skip_while(|m| tokens[m.token_position].byte_start < byte_start)
+                .take_while(|m| tokens[m.token_position].byte_start < byte_end)
             {
-                if byte_index < m.token.byte_start {
-                    formatted.push(&self.text[byte_index..m.token.byte_start]);
+                let token = &tokens[m.token_position];
+
+                if byte_index < token.byte_start {
+                    formatted.push(&self.text[byte_index..token.byte_start]);
                 }
 
                 formatted.push(self.highlight_prefix);
-                formatted.push(&self.text[m.token.byte_start..m.token.byte_end]);
+                formatted.push(&self.text[token.byte_start..token.byte_end]);
                 formatted.push(self.highlight_suffix);
-                byte_index = m.token.byte_end;
+                byte_index = token.byte_end;
             }
         }
 
@@ -271,7 +349,7 @@ mod tests {
         assert_eq!(&matcher.format(highlight, crop), &text);
 
         // Text containing all matches.
-        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World";
+        let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.";
         let analyzed = analyzer.analyze(&text);
         let tokens: Vec<_> = analyzed.tokens().collect();
        let mut matcher = builder.build(&tokens[..], text);
@@ -306,12 +384,12 @@
         assert_eq!(&matcher.format(highlight, crop), &text);
 
         // Text containing all matches.
- let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); // no crop should return complete text with highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"); + assert_eq!(&matcher.format(highlight, crop), "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."); // Text containing some matches. let text = "Natalie risk her future to build a world with the boy she loves."; @@ -343,18 +421,18 @@ mod tests { // no highlight should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; + // Test phrase propagation + let text = "Natalie risk her future. Split The World is a book written by Emily Henry. I never read it."; let analyzed = analyzer.analyze(&text); let tokens: Vec<_> = analyzed.tokens().collect(); let mut matcher = builder.build(&tokens[..], text); - // no highlight should return 10 last words with a marker at the start. + // should crop the phrase instead of croping around the match. assert_eq!( &matcher.format(highlight, crop), - "…she loves. Emily Henry: The Love That Split The World" + "…Split The World is a book written by Emily Henry. …" ); // Text containing some matches. @@ -368,6 +446,17 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // no highlight should return 10 last words with a marker at the start. + assert_eq!( + &matcher.format(highlight, crop), + "…she loves. Emily Henry: The Love That Split The World." + ); + // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text); @@ -398,17 +487,9 @@ mod tests { // both should return 10 first words with a marker at the end. assert_eq!( &matcher.format(highlight, crop), - "A quick brown fox can not jump 32 feet, right…" + "A quick brown fox can not jump 32 feet, right? …" ); - // Text containing all matches. - let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World"; - let analyzed = analyzer.analyze(&text); - let tokens: Vec<_> = analyzed.tokens().collect(); - let mut matcher = builder.build(&tokens[..], text); - // both should return 10 last words with a marker at the start and highlighted matches. - assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World"); - // Text containing some matches. 
let text = "Natalie risk her future to build a world with the boy she loves."; let analyzed = analyzer.analyze(&text); @@ -420,6 +501,14 @@ mod tests { "…future to build a world with the boy she loves." ); + // Text containing all matches. + let text = "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."; + let analyzed = analyzer.analyze(&text); + let tokens: Vec<_> = analyzed.tokens().collect(); + let mut matcher = builder.build(&tokens[..], text); + // both should return 10 last words with a marker at the start and highlighted matches. + assert_eq!(&matcher.format(highlight, crop), "…she loves. Emily Henry: The Love That Split The World."); + // Text containing a match unordered and a match ordered. let text = "The world split void void void void void void void void void split the world void void"; let analyzed = analyzer.analyze(&text);