From edcb4c60ba0bc416152bdfd931598bfa0df87467 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 12 Sep 2024 09:44:37 +0300 Subject: [PATCH 01/21] Change Matcher so that phrases are counted as one instead of word by word --- milli/src/search/new/matches/mod.rs | 45 +++++++++++------------------ 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 4688b8f32..6ddb81c6a 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { mut partial: PartialMatch<'a>, token_position: usize, word_position: usize, + first_word_char_start: &usize, words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - let mut potential_matches = vec![(token_position, word_position, partial.char_len())]; - - for (token_position, word_position, word) in words_positions { + for (_, _, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one. - Some(MatchType::Partial(partial)) => { - potential_matches.push((token_position, word_position, partial.char_len())); - partial - } + Some(MatchType::Partial(partial)) => partial, // partial match is now full, we keep this matches and we advance positions - Some(MatchType::Full { char_len, ids }) => { - let ids: Vec<_> = ids.clone().collect(); - // save previously matched tokens as matches. - let iter = potential_matches.into_iter().map( - |(token_position, word_position, match_len)| Match { - match_len, - ids: ids.clone(), - word_position, - token_position, - }, - ); - matches.extend(iter); - + Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: char_len, - ids, + match_len: word.char_end - first_word_char_start, + ids: ids.clone().collect(), word_position, token_position, }); @@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { partial, token_position, word_position, + &word.char_start, &mut wp, &mut matches, ) { @@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .enumerate() .find(|(i, _)| *i == m.match_len) .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + formatted.push(self.highlight_prefix); formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); + // if it's a prefix highlight, we put the end of the word after the highlight marker. if highlight_byte_index < token.byte_end { formatted.push(&self.text[highlight_byte_index..token.byte_end]); } - byte_index = token.byte_end; + byte_index = token.byte_start + m.match_len; } } @@ -821,22 +808,24 @@ mod tests { fn format_highlight_crop_phrase_query() { //! testing: https://github.com/meilisearch/meilisearch/issues/3975 let temp_index = TempIndex::new(); + + let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; temp_index .add_documents(documents!([ - { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" } + { "id": 1, "text": text } ])) .unwrap(); + let rtxn = temp_index.read_txn().unwrap(); let format_options = FormatOptions { highlight: true, crop: Some(10) }; - let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); let mut matcher = builder.build(text, None); // should return 10 words with a marker at the start as well the end, and the highlighted matches. insta::assert_snapshot!( matcher.format(format_options), - @"…had the power to split the world between those who…" + @"…had the power to split the world between those who…" ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); @@ -844,7 +833,7 @@ mod tests { // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" + @"…world between those who embraced progress and those who resisted…" ); } @@ -900,7 +889,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_" + @"_the_ _do or_ die can't be he do and or isn'_t he_" ); } } From e7af499314f24e51f1bff27ff231ceb898aa27a1 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 12 Sep 2024 16:58:13 +0300 Subject: [PATCH 02/21] Improve changes to Matcher --- milli/src/search/new/matches/mod.rs | 136 +++++++++++++++++++++------- 1 file changed, 104 insertions(+), 32 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 6ddb81c6a..26dd6f6e8 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,15 +93,28 @@ impl FormatOptions { } } +#[derive(Clone, Debug)] +pub enum MatchPosition { + Word { + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, + }, + Phrase { + // position of the first and last word in the phrase in the whole text. + word_positions: (usize, usize), + // position of the first and last token in the phrase in the whole text. + token_positions: (usize, usize), + }, +} + #[derive(Clone, Debug)] pub struct Match { match_len: usize, // ids of the query words that matches. ids: Vec, - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, + position: MatchPosition, } #[derive(Serialize, Debug, Clone, PartialEq, Eq)] @@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// compute_partial_match peek into next words to validate if the match is complete. fn compute_partial_match<'a>( mut partial: PartialMatch<'a>, - token_position: usize, - word_position: usize, + first_token_position: usize, + first_word_position: usize, first_word_char_start: &usize, words_positions: &mut impl Iterator)>, matches: &mut Vec, ) -> bool { - for (_, _, word) in words_positions { + for (token_position, word_position, word) in words_positions { partial = match partial.match_token(word) { // token matches the partial match, but the match is not full, // we temporarily save the current token then we try to match the next one. @@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: word.char_end - first_word_char_start, + match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), - word_position, - token_position, + position: MatchPosition::Phrase { + word_positions: (first_word_position, word_position), + token_positions: (first_token_position, token_position), + }, }); // the match is complete, we return true. @@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { matches.push(Match { match_len: char_len, ids, - word_position, - token_position, + position: MatchPosition::Word { word_position, token_position }, }); break; } @@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[m.token_position].byte_start, + start: tokens[match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { + token_positions: (first_token_position, _), + .. + } => first_token_position, + }] + .byte_start, length: m.match_len, }) .collect(), } } + // @TODO: This should be improved, looks nasty + fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize { + match m.position { + MatchPosition::Word { word_position, token_position } => { + if is_word { + word_position + } else { + token_position + } + } + MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => { + if is_word { + if is_first { + return wpf; + } else { + return wpl; + } + } + if is_first { + tpf + } else { + tpl + } + } + } + } + /// Returns the bounds in byte index of the crop window. fn crop_bounds( &self, @@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); - let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); + let first_match_word_position = + matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0); + let first_match_token_position = + matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0); + let last_match_word_position = + matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0); + let last_match_token_position = + matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } // compute distance between matches - distance_score -= (next_match.word_position - m.word_position).min(7) as i16; + distance_score -= (self.get_match_pos(next_match, true, true) + - self.get_match_pos(m, true, true)) + .min(7) as i16; } ids.extend(m.ids.iter()); @@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - if next_match.word_position - matches[interval_first].word_position >= crop_size { + let next_match_word_position = self.get_match_pos(next_match, true, true); + + if next_match_word_position + - self.get_match_pos(&matches[interval_first], false, true) + >= crop_size + { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } // advance start of the interval while interval is longer than crop_size. - while next_match.word_position - matches[interval_first].word_position - >= crop_size - { + loop { interval_first += 1; + + if next_match_word_position + - self.get_match_pos(&matches[interval_first], false, true) + < crop_size + { + break; + } } } interval_last = index; @@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { if format_options.highlight { // insert highlight markers around matches. for m in matches { - let token = &tokens[m.token_position]; + let (current_byte_start, current_byte_end) = match m.position { + MatchPosition::Word { token_position, .. } => { + let token = &tokens[token_position]; + (&token.byte_start, &token.byte_end) + } + MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { + (&tokens[ftp].byte_start, &tokens[ltp].byte_end) + } + }; // skip matches out of the crop window. - if token.byte_start < byte_start || token.byte_end > byte_end { + if *current_byte_start < byte_start || *current_byte_end > byte_end { continue; } - if byte_index < token.byte_start { - formatted.push(&self.text[byte_index..token.byte_start]); + if byte_index < *current_byte_start { + formatted.push(&self.text[byte_index..*current_byte_start]); } - let highlight_byte_index = self.text[token.byte_start..] + let highlight_byte_index = self.text[*current_byte_start..] .char_indices() .enumerate() .find(|(i, _)| *i == m.match_len) - .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start); + .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start); formatted.push(self.highlight_prefix); - formatted.push(&self.text[token.byte_start..highlight_byte_index]); + formatted.push(&self.text[*current_byte_start..highlight_byte_index]); formatted.push(self.highlight_suffix); // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < token.byte_end { - formatted.push(&self.text[highlight_byte_index..token.byte_end]); + if highlight_byte_index < *current_byte_end { + formatted.push(&self.text[highlight_byte_index..*current_byte_end]); } - byte_index = token.byte_start + m.match_len; + byte_index = *current_byte_end; } } From cc6a2aec06ebd6cb7332afb0478affe3e63185af Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:31:07 +0300 Subject: [PATCH 03/21] Improve changes to Matcher --- milli/src/search/new/matches/mod.rs | 78 +++++++++++++++-------------- 1 file changed, 41 insertions(+), 37 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 26dd6f6e8..a84b25923 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,6 +93,16 @@ impl FormatOptions { } } +enum FL { + First, + Last, +} + +enum WT { + Word, + Token, +} + #[derive(Clone, Debug)] pub enum MatchPosition { Word { @@ -256,28 +266,22 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } - // @TODO: This should be improved, looks nasty - fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize { + fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { match m.position { - MatchPosition::Word { word_position, token_position } => { - if is_word { - word_position - } else { - token_position - } - } - MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => { - if is_word { - if is_first { - return wpf; - } else { - return wpl; - } - } - if is_first { - tpf - } else { - tpl + MatchPosition::Word { word_position, token_position } => match wt { + WT::Word => word_position, + WT::Token => token_position, + }, + MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => { + match wt { + WT::Word => match fl { + FL::First => fwp, + FL::Last => lwp, + }, + WT::Token => match fl { + FL::First => ftp, + FL::Last => ltp, + }, } } } @@ -292,13 +296,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = - matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0); + matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0); + matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0); let last_match_word_position = - matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0); + matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0); let last_match_token_position = - matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0); + matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -401,10 +405,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } + let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First); + // compute distance between matches - distance_score -= (self.get_match_pos(next_match, true, true) - - self.get_match_pos(m, true, true)) - .min(7) as i16; + distance_score -= + (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16; } ids.extend(m.ids.iter()); @@ -432,12 +438,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_position = self.get_match_pos(next_match, true, true); + let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let mut interval_first_match_word_pos = + self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); - if next_match_word_position - - self.get_match_pos(&matches[interval_first], false, true) - >= crop_size - { + if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -450,11 +455,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // advance start of the interval while interval is longer than crop_size. loop { interval_first += 1; + interval_first_match_word_pos = + self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); - if next_match_word_position - - self.get_match_pos(&matches[interval_first], false, true) - < crop_size - { + if next_match_word_pos - interval_first_match_word_pos < crop_size { break; } } From 65e3d61a955dd9b0f4b877d17a0b2b0dc087816c Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:35:58 +0300 Subject: [PATCH 04/21] Make use of helper function in one more place --- milli/src/search/new/matches/mod.rs | 35 ++++++++++++----------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index a84b25923..5a4f0b914 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -245,27 +245,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { self } - /// Returns boundaries of the words that match the query. - pub fn matches(&mut self) -> Vec { - match &self.matches { - None => self.compute_matches().matches(), - Some((tokens, matches)) => matches - .iter() - .map(|m| MatchBounds { - start: tokens[match m.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { - token_positions: (first_token_position, _), - .. - } => first_token_position, - }] - .byte_start, - length: m.match_len, - }) - .collect(), - } - } - fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { match m.position { MatchPosition::Word { word_position, token_position } => match wt { @@ -287,6 +266,20 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } + /// Returns boundaries of the words that match the query. + pub fn matches(&mut self) -> Vec { + match &self.matches { + None => self.compute_matches().matches(), + Some((tokens, matches)) => matches + .iter() + .map(|m| MatchBounds { + start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start, + length: m.match_len, + }) + .collect(), + } + } + /// Returns the bounds in byte index of the crop window. fn crop_bounds( &self, From cab63abc845d87350ab36c07d3999b58eebd0eaa Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 14:35:28 +0300 Subject: [PATCH 05/21] Improve MatchesPosition enum with an impl --- milli/src/search/new/matches/mod.rs | 81 ++++++++++++++--------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 5a4f0b914..ce878a1eb 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -93,16 +93,6 @@ impl FormatOptions { } } -enum FL { - First, - Last, -} - -enum WT { - Word, - Token, -} - #[derive(Clone, Debug)] pub enum MatchPosition { Word { @@ -127,6 +117,36 @@ pub struct Match { position: MatchPosition, } +impl MatchPosition { + fn get_first_word(m: &Match) -> usize { + match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, + } + } + + fn get_last_word(m: &Match) -> usize { + match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp, + } + } + + fn get_first_token(m: &Match) -> usize { + match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, + } + } + + fn get_last_token(m: &Match) -> usize { + match m.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, + } + } +} + #[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, @@ -245,27 +265,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { self } - fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize { - match m.position { - MatchPosition::Word { word_position, token_position } => match wt { - WT::Word => word_position, - WT::Token => token_position, - }, - MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => { - match wt { - WT::Word => match fl { - FL::First => fwp, - FL::Last => lwp, - }, - WT::Token => match fl { - FL::First => ftp, - FL::Last => ltp, - }, - } - } - } - } - /// Returns boundaries of the words that match the query. pub fn matches(&mut self) -> Vec { match &self.matches { @@ -273,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start, + start: tokens[MatchPosition::get_first_token(m)].byte_start, length: m.match_len, }) .collect(), @@ -289,13 +288,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = - matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0); + matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0); + matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0); let last_match_word_position = - matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0); + matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0); let last_match_token_position = - matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0); + matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -398,8 +397,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } - let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); - let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First); + let next_match_first_word_pos = MatchPosition::get_first_word(next_match); + let current_match_first_word_pos = MatchPosition::get_first_word(m); // compute distance between matches distance_score -= @@ -431,9 +430,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First); + let next_match_word_pos = MatchPosition::get_first_word(next_match); let mut interval_first_match_word_pos = - self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); + MatchPosition::get_last_word(&matches[interval_first]); if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = @@ -449,7 +448,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { loop { interval_first += 1; interval_first_match_word_pos = - self.get_match_pos(&matches[interval_first], WT::Word, FL::Last); + MatchPosition::get_last_word(&matches[interval_first]); if next_match_word_pos - interval_first_match_word_pos < crop_size { break; From a2a16bf846066f422a5e6bd9bcb0009a894dcad0 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:20:06 +0300 Subject: [PATCH 06/21] Move MatchPosition impl to Match, adjust counting score for phrases --- milli/src/search/new/matches/mod.rs | 66 +++++++++++++++++++---------- 1 file changed, 43 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ce878a1eb..e63920145 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -117,30 +117,30 @@ pub struct Match { position: MatchPosition, } -impl MatchPosition { - fn get_first_word(m: &Match) -> usize { - match m.position { +impl Match { + fn get_first_word_pos(&self) -> usize { + match self.position { MatchPosition::Word { word_position, .. } => word_position, MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, } } - fn get_last_word(m: &Match) -> usize { - match m.position { + fn get_last_word_pos(&self) -> usize { + match self.position { MatchPosition::Word { word_position, .. } => word_position, MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp, } } - fn get_first_token(m: &Match) -> usize { - match m.position { + fn get_first_token_pos(&self) -> usize { + match self.position { MatchPosition::Word { token_position, .. } => token_position, MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, } } - fn get_last_token(m: &Match) -> usize { - match m.position { + fn get_last_token_pos(&self) -> usize { + match self.position { MatchPosition::Word { token_position, .. } => token_position, MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, } @@ -272,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some((tokens, matches)) => matches .iter() .map(|m| MatchBounds { - start: tokens[MatchPosition::get_first_token(m)].byte_start, + start: tokens[m.get_first_token_pos()].byte_start, length: m.match_len, }) .collect(), @@ -288,13 +288,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. let first_match_word_position = - matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0); + matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); let first_match_token_position = - matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0); - let last_match_word_position = - matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0); - let last_match_token_position = - matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0); + matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); + let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); + let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); // matches needs to be counted in the crop len. let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; @@ -389,6 +387,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut order_score = 0; let mut distance_score = 0; + // Count score for phrases + let tally_phrase_scores = + |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + }; + let mut iter = matches.iter().peekable(); while let Some(m) = iter.next() { if let Some(next_match) = iter.peek() { @@ -397,12 +405,24 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { order_score += 1; } - let next_match_first_word_pos = MatchPosition::get_first_word(next_match); - let current_match_first_word_pos = MatchPosition::get_first_word(m); + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + + let next_match_first_word_pos = match next_match.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, + }; // compute distance between matches - distance_score -= - (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16; + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); } ids.extend(m.ids.iter()); @@ -430,9 +450,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_pos = MatchPosition::get_first_word(next_match); + let next_match_word_pos = next_match.get_last_word_pos(); let mut interval_first_match_word_pos = - MatchPosition::get_last_word(&matches[interval_first]); + matches[interval_first].get_first_word_pos(); if next_match_word_pos - interval_first_match_word_pos >= crop_size { let interval_score = @@ -448,7 +468,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { loop { interval_first += 1; interval_first_match_word_pos = - MatchPosition::get_last_word(&matches[interval_first]); + matches[interval_first].get_first_word_pos(); if next_match_word_pos - interval_first_match_word_pos < crop_size { break; From 51085206ccab6e8e0098c4cf8b2a3e67e06558a4 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Sat, 14 Sep 2024 10:14:07 +0300 Subject: [PATCH 07/21] Misc adjustments --- milli/src/search/new/matches/mod.rs | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index e63920145..414509cd3 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -387,7 +387,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut order_score = 0; let mut distance_score = 0; - // Count score for phrases + // count score for phrases let tally_phrase_scores = |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { let words_in_phrase_minus_one = (lwp - fwp) as i16; @@ -450,11 +450,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. - let next_match_word_pos = next_match.get_last_word_pos(); - let mut interval_first_match_word_pos = + let next_match_last_word_pos = next_match.get_last_word_pos(); + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - if next_match_word_pos - interval_first_match_word_pos >= crop_size { + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); @@ -467,10 +467,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // advance start of the interval while interval is longer than crop_size. loop { interval_first += 1; - interval_first_match_word_pos = + interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - if next_match_word_pos - interval_first_match_word_pos < crop_size { + if next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { break; } } From 993408d3ba65cbcea9920caeab8b421160a931ac Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Sun, 15 Sep 2024 16:15:09 +0300 Subject: [PATCH 08/21] Change closure to fn --- milli/src/search/new/matches/mod.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 414509cd3..df110aff9 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -388,14 +388,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let mut distance_score = 0; // count score for phrases - let tally_phrase_scores = - |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - }; + fn tally_phrase_scores( + fwp: &usize, + lwp: &usize, + order_score: &mut i16, + distance_score: &mut i16, + ) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + } let mut iter = matches.iter().peekable(); while let Some(m) = iter.next() { From f7337affd6342ae495d99312862b300e7af461e0 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:31:09 +0300 Subject: [PATCH 09/21] Adjust tests to changes --- meilisearch/tests/search/locales.rs | 44 ++++++++++++++--------------- milli/src/search/new/matches/mod.rs | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs index dbc4fcc30..b9e70c5b1 100644 --- a/meilisearch/tests/search/locales.rs +++ b/meilisearch/tests/search/locales.rs @@ -400,9 +400,9 @@ async fn force_locales() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -447,9 +447,9 @@ async fn force_locales() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -524,9 +524,9 @@ async fn force_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -571,9 +571,9 @@ async fn force_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -689,8 +689,8 @@ async fn force_locales_with_pattern_nested() { "author": "諫山 創" }, "document_zh": { - "name": "巨人", - "description": "巨人是日本的漫画系列,由諫山 創作画。", + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", "author": "諫山創" }, "id": "852", @@ -788,9 +788,9 @@ async fn force_different_locales_with_pattern() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -889,9 +889,9 @@ async fn auto_infer_locales_at_search_with_attributes_to_search_on() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -965,9 +965,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1011,9 +1011,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1057,9 +1057,9 @@ async fn auto_infer_locales_at_search() { ] }, "_formatted": { - "name_zh": "巨人", + "name_zh": "进击的巨人", "author_zh": "諫山創", - "description_zh": "巨人是日本的漫画系列,由諫山 創作画。", + "description_zh": "进击的巨人是日本的漫画系列,由諫山 創作画。", "id": "853", "_vectors": { "manual": [ @@ -1177,8 +1177,8 @@ async fn force_different_locales_with_pattern_nested() { "author": "諫山 創" }, "document_zh": { - "name": "巨人", - "description": "巨人是日本的漫画系列,由諫山 創作画。", + "name": "进击的巨人", + "description": "进击的巨人是日本的漫画系列,由諫山 創作画。", "author": "諫山創" }, "id": "852", diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index df110aff9..09d3db575 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -919,7 +919,7 @@ mod tests { // should return 10 words with a marker at the start as well the end, and the highlighted matches. insta::assert_snapshot!( matcher.format(format_options), - @"…had the power to split the world between those who…" + @"…the power to split the world between those who embraced…" ); let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); From 83113998f99bb6d59bb9e94e9ef3e527f4c93f62 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Wed, 18 Sep 2024 10:35:23 +0300 Subject: [PATCH 10/21] Add more test assertions --- milli/src/search/new/matches/mod.rs | 36 +++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 09d3db575..8a84f91bd 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -929,6 +929,42 @@ mod tests { matcher.format(format_options), @"…world between those who embraced progress and those who resisted…" ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the world…" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world between\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the world …" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", + ); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"…between those who embraced progress and those who resisted change…" + ); } #[test] From 0ffeea5a5209f1e206720e3cf63d7fe627b8cee0 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 19 Sep 2024 09:06:40 +0300 Subject: [PATCH 11/21] Remove wrong comments --- milli/src/search/new/matches/mod.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 8a84f91bd..26115c39b 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -936,7 +936,6 @@ mod tests { "\"The groundbreaking invention had the power to split the world\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @"The groundbreaking invention had the power to split the world…" @@ -948,7 +947,6 @@ mod tests { "\"The groundbreaking invention had the power to split the world between\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @"The groundbreaking invention had the power to split the world …" @@ -960,7 +958,6 @@ mod tests { "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", ); let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), @"…between those who embraced progress and those who resisted change…" From d20a39b9599f7962b2a316e45cc126f90a3d8eed Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 27 Sep 2024 15:44:30 +0300 Subject: [PATCH 12/21] Refactor find_best_match_interval --- milli/src/search/new/matches/mod.rs | 154 +++++++++++++++++++--------- 1 file changed, 106 insertions(+), 48 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 26115c39b..bbd39e682 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -442,36 +442,48 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { + let matches_len = matches.len(); + // we compute the matches interval if we have at least 2 matches. - if matches.len() > 1 { + if matches_len > 1 { + // current interval positions. + let mut interval_first = 0; // positions of the first and the last match of the best matches interval in `matches`. let mut best_interval = (0, 0); let mut best_interval_score = self.match_interval_score(&matches[0..=0]); - // current interval positions. - let mut interval_first = 0; - let mut interval_last = 0; - for (index, next_match) in matches.iter().enumerate().skip(1) { + + let mut index = 1; + while index < matches_len - 1 { + let next_match = &matches[index]; + // if next match would make interval gross more than crop_size, // we compare the current interval with the best one, // then we increase `interval_first` until next match can be added. let next_match_last_word_pos = next_match.get_last_word_pos(); - let mut interval_first_match_first_word_pos = + let interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); + // skip for 1, because it would result in the same as our very first interval score + if index != 1 { + let interval_last = index - 1; + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; + // keep interval if it's the best + if interval_score > best_interval_score { + best_interval = (interval_first, interval_last); + best_interval_score = interval_score; + } } // advance start of the interval while interval is longer than crop_size. loop { interval_first += 1; - interval_first_match_first_word_pos = + let interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); if next_match_last_word_pos - interval_first_match_first_word_pos @@ -481,10 +493,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } } } - interval_last = index; + + index += 1; } // compute the last interval score and compare it to the best one. + let interval_last = matches_len - 1; let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); if interval_score > best_interval_score { @@ -914,32 +928,32 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: Some(10) }; - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); - let mut matcher = builder.build(text, None); - // should return 10 words with a marker at the start as well the end, and the highlighted matches. - insta::assert_snapshot!( - matcher.format(format_options), - @"…the power to split the world between those who embraced…" - ); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); + // let mut matcher = builder.build(text, None); + // // should return 10 words with a marker at the start as well the end, and the highlighted matches. + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…the power to split the world between those who embraced…" + // ); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); - let mut matcher = builder.build(text, None); - // should highlight "those" and the phrase "and those". - insta::assert_snapshot!( - matcher.format(format_options), - @"…world between those who embraced progress and those who resisted…" - ); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + // let mut matcher = builder.build(text, None); + // // should highlight "those" and the phrase "and those". + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…groundbreaking invention had the power to split the world between…" + // ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention had the power to split the world\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"The groundbreaking invention had the power to split the world…" - ); + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention had the power to split the world\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"The groundbreaking invention had the power to split the world…" + // ); let builder = MatcherBuilder::new_test( &rtxn, @@ -952,16 +966,60 @@ mod tests { @"The groundbreaking invention had the power to split the world …" ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…between those who embraced progress and those who resisted change…" - ); + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…between those who embraced progress and those who resisted change…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…the power to split the world between those who embraced…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"groundbreaking invention\" \"split the world between\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…groundbreaking invention had the power to split the world between…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"groundbreaking invention\" \"had the power to split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…invention had the power to split the world between those…" + // ); + + // let builder = MatcherBuilder::new_test( + // &rtxn, + // &temp_index, + // "\"The groundbreaking invention\" \"had the power to split the world between those\"", + // ); + // let mut matcher = builder.build(text, None); + // insta::assert_snapshot!( + // matcher.format(format_options), + // @"…invention had the power to split the world between those…" + // ); } #[test] From eabc14c26858d9f0bda89e6fa38f0aa4b0244be8 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Mon, 30 Sep 2024 21:24:41 +0300 Subject: [PATCH 13/21] Refactor, handle more cases for phrases --- .../src/search/new/matches/matching_words.rs | 2 +- milli/src/search/new/matches/mod.rs | 497 ++++++++++-------- 2 files changed, 291 insertions(+), 208 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4ad5c37ec..4deaff6a0 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -181,7 +181,7 @@ impl<'a> PartialMatch<'a> { // return a new Partial match allowing the highlighter to continue. if is_matching && matching_words.len() > 1 { matching_words.remove(0); - Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len })) + Some(MatchType::Partial(Self { matching_words, ids, char_len })) // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index bbd39e682..624287f5f 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,6 +1,6 @@ use std::borrow::Cow; -use charabia::{Language, SeparatorKind, Token, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; @@ -145,6 +145,13 @@ impl Match { MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, } } + + fn get_word_count(&self) -> usize { + match self.position { + MatchPosition::Word { .. } => 1, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1, + } + } } #[derive(Serialize, Debug, Clone, PartialEq, Eq)] @@ -153,6 +160,27 @@ pub struct MatchBounds { pub length: usize, } +enum SimpleTokenKind { + Separator(SeparatorKind), + NotSeparator, +} + +impl SimpleTokenKind { + fn get(token: &&Token<'_>) -> Self { + match token.kind { + TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), + _ => Self::NotSeparator, + } + } + + fn is_not_separator(&self) -> bool { + match self { + SimpleTokenKind::NotSeparator => true, + SimpleTokenKind::Separator(_) => false, + } + } +} + /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -287,95 +315,130 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let first_match_word_position = + let first_match_first_word_position = matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); - let first_match_token_position = + let first_match_first_token_position = matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); - let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); - let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); + let last_match_last_word_position = + matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); + let last_match_last_token_position = + matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size + first_match_word_position - last_match_word_position; + let matches_window_len = + last_match_last_word_position - first_match_first_word_position + 1; - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[last_match_token_position..].iter().peekable(); + if crop_size >= matches_window_len { + // matches needs to be counted in the crop len. + let mut remaining_words = crop_size - matches_window_len; - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token = before_tokens.peek().map(|t| t.separator_kind()); - let after_token = after_tokens.peek().map(|t| t.separator_kind()); + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = + tokens[..first_match_first_token_position].iter().rev().peekable(); + // an iterator starting from the last match token position and going towards the end of the text. + let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable(); - match (before_token, after_token) { - // we can expand both sides. - (Some(before_token), Some(after_token)) => { - match (before_token, after_token) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - (Some(before_token_kind), Some(after_token_kind)) => { - if before_token_kind == after_token_kind { - before_tokens.next(); + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get); + let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get); - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { + match (before_token_kind, after_token_kind) { + // we can expand both sides. + (Some(before_token_kind), Some(after_token_kind)) => { + match (before_token_kind, after_token_kind) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. + ( + SimpleTokenKind::Separator(before_token_separator_kind), + SimpleTokenKind::Separator(after_token_separator_kind), + ) => { + if before_token_separator_kind == after_token_separator_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. + if remaining_words > 1 { + after_tokens.next(); + } + } else if let SeparatorKind::Hard = before_token_separator_kind { after_tokens.next(); + } else { + before_tokens.next(); } - } else if before_token_kind == SeparatorKind::Hard { - after_tokens.next(); - } else { - before_tokens.next(); } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (None, Some(_)) => { - before_tokens.next(); - remaining_words -= 1; - } - // right is a word, advance right. - (Some(_), None) => { - after_tokens.next(); - remaining_words -= 1; - } - // both are words, advance left then right if remaining_word > 0. - (None, None) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { + // if one of the tokens is a word, we expend in the side of the word. + // left is a word, advance left. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { after_tokens.next(); remaining_words -= 1; } + // both are words, advance left then right if remaining_word > 0. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { + after_tokens.next(); + remaining_words -= 1; + } + } } } - } - // the end of the text is reached, advance left. - (Some(before_token), None) => { - before_tokens.next(); - if before_token.is_none() { - remaining_words -= 1; + // the end of the text is reached, advance left. + (Some(before_token_kind), None) => { + before_tokens.next(); + if let SimpleTokenKind::NotSeparator = before_token_kind { + remaining_words -= 1; + } } - } - // the start of the text is reached, advance right. - (None, Some(after_token)) => { - after_tokens.next(); - if after_token.is_none() { - remaining_words -= 1; + // the start of the text is reached, advance right. + (None, Some(after_token_kind)) => { + after_tokens.next(); + if let SimpleTokenKind::NotSeparator = after_token_kind { + remaining_words -= 1; + } } + // no more token to add. + (None, None) => break, } - // no more token to add. - (None, None) => break, } + + // finally, keep the byte index of each bound of the crop window. + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) + } else { + // there's one match? and it's longer than the crop window, so we have to advance inward + let mut remaining_extra_words = matches_window_len - crop_size; + let mut tokens_from_end = + tokens[..=last_match_last_token_position].iter().rev().peekable(); + + while remaining_extra_words > 0 { + let token_from_end_kind = + tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO"); + if token_from_end_kind.is_not_separator() { + remaining_extra_words -= 1; + } + + tokens_from_end.next(); + } + + let crop_byte_start = if first_match_first_token_position > 0 { + &tokens[first_match_first_token_position - 1].byte_end + } else { + &0 + }; + let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO"); + + (*crop_byte_start, crop_byte_end) } - - // finally, keep the byte index of each bound of the crop window. - let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) } /// Compute the score of a match interval: @@ -416,11 +479,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { lwp } }; - - let next_match_first_word_pos = match next_match.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, - }; + let next_match_first_word_pos = next_match.get_first_word_pos(); // compute distance between matches distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; @@ -443,72 +502,96 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { let matches_len = matches.len(); + if matches_len <= 1 { + return matches; + } + + // positions of the first and the last match of the best matches interval in `matches`. + struct BestInterval { + interval: (usize, usize), + score: (i16, i16, i16), + } + + fn save_best_interval( + best_interval: &mut Option, + interval_first: usize, + interval_last: usize, + interval_score: (i16, i16, i16), + ) { + if let Some(best_interval) = best_interval { + if interval_score > best_interval.score { + best_interval.interval = (interval_first, interval_last); + best_interval.score = interval_score; + } + } else { + *best_interval = Some(BestInterval { + interval: (interval_first, interval_last), + score: interval_score, + }); + } + } + + let mut best_interval: Option = None; // we compute the matches interval if we have at least 2 matches. - if matches_len > 1 { - // current interval positions. - let mut interval_first = 0; - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval = (0, 0); - let mut best_interval_score = self.match_interval_score(&matches[0..=0]); + // current interval positions. + let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - let mut index = 1; - while index < matches_len - 1 { - let next_match = &matches[index]; + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - let interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + let interval_score = + self.match_interval_score(&matches[interval_first..=interval_last]); - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // skip for 1, because it would result in the same as our very first interval score - if index != 1 { - let interval_last = index - 1; - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - best_interval_score = interval_score; - } - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - let interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } + // keep interval if it's the best + save_best_interval( + &mut best_interval, + interval_first, + interval_last, + interval_score, + ); } - index += 1; - } + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + interval_first_match_first_word_pos = + matches[interval_first].get_first_word_pos(); - // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. + let interval_last = matches_len - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { let interval_score = self.match_interval_score(&matches[interval_first..=interval_last]); - if interval_score > best_interval_score { - best_interval = (interval_first, interval_last); - } - - &matches[best_interval.0..=best_interval.1] - } else { - matches + save_best_interval(&mut best_interval, interval_first, interval_last, interval_score); } + + // if none of the matches fit the criteria above, default to the first one + let best_interval = best_interval.map_or((0, 0), |v| v.interval); + &matches[best_interval.0..=best_interval.1] } // Returns the formatted version of the original text. @@ -928,98 +1011,98 @@ mod tests { let format_options = FormatOptions { highlight: true, crop: Some(10) }; - // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); - // let mut matcher = builder.build(text, None); - // // should return 10 words with a marker at the start as well the end, and the highlighted matches. - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…the power to split the world between those who embraced…" - // ); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\""); + let mut matcher = builder.build(text, None); + // should return 10 words with a marker at the start as well the end, and the highlighted matches. + insta::assert_snapshot!( + matcher.format(format_options), + @"…the power to split the world between those who embraced…" + ); - // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); - // let mut matcher = builder.build(text, None); - // // should highlight "those" and the phrase "and those". - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…groundbreaking invention had the power to split the world between…" - // ); - - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention had the power to split the world\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"The groundbreaking invention had the power to split the world…" - // ); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + let mut matcher = builder.build(text, None); + // should highlight "those" and the phrase "and those". + insta::assert_snapshot!( + matcher.format(format_options), + @"…groundbreaking invention had the power to split the world between…" + ); let builder = MatcherBuilder::new_test( &rtxn, &temp_index, - "\"The groundbreaking invention had the power to split the world between\"", + "\"The groundbreaking invention had the power to split the world\"", ); let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"The groundbreaking invention had the power to split the world …" + @"The groundbreaking invention had the power to split the world…" ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…between those who embraced progress and those who resisted change…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"The groundbreaking invention had the power to split the world…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…the power to split the world between those who embraced…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…between those who embraced progress and those who resisted change…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"groundbreaking invention\" \"split the world between\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…groundbreaking invention had the power to split the world between…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…the power to split the world between those who embraced…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"groundbreaking invention\" \"had the power to split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…invention had the power to split the world between those…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"groundbreaking invention\" \"split the world between\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…groundbreaking invention had the power to split the world between…" + ); - // let builder = MatcherBuilder::new_test( - // &rtxn, - // &temp_index, - // "\"The groundbreaking invention\" \"had the power to split the world between those\"", - // ); - // let mut matcher = builder.build(text, None); - // insta::assert_snapshot!( - // matcher.format(format_options), - // @"…invention had the power to split the world between those…" - // ); + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"groundbreaking invention\" \"had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…invention had the power to split the world between those…" + ); + + let builder = MatcherBuilder::new_test( + &rtxn, + &temp_index, + "\"The groundbreaking invention\" \"had the power to split the world between those\"", + ); + let mut matcher = builder.build(text, None); + insta::assert_snapshot!( + matcher.format(format_options), + @"…invention had the power to split the world between those…" + ); } #[test] From 6d16230f17eb000407adb21dc2f3e9fa49767cc8 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:19:15 +0300 Subject: [PATCH 14/21] Refactor --- milli/src/search/new/matches/mod.rs | 327 ++++++++++++++-------------- 1 file changed, 158 insertions(+), 169 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 624287f5f..804b59553 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -181,6 +181,149 @@ impl SimpleTokenKind { } } +#[derive(PartialEq, PartialOrd)] +struct MatchIntervalScore(i16, i16, i16); + +impl MatchIntervalScore { + /// Compute the score of a match interval: + /// 1) count unique matches + /// 2) calculate distance between matches + /// 3) count ordered matches + fn new(matches: &[Match]) -> Self { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + // count score for phrases + fn tally_phrase_scores( + fwp: &usize, + lwp: &usize, + order_score: &mut i16, + distance_score: &mut i16, + ) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; + } + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + Self(uniq_score, distance_score, order_score) + } +} + +struct MatchIntervalWithScore { + interval: (usize, usize), + score: MatchIntervalScore, +} + +impl MatchIntervalWithScore { + /// Returns the matches interval where the score computed by match_interval_score is the best. + fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { + let matches_len = matches.len(); + if matches_len <= 1 { + return matches; + } + + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval: Option = None; + let mut save_best_interval = |interval_first, interval_last, interval_score| { + let is_interval_score_better = + &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); + if *is_interval_score_better { + best_interval = + Some(Self { interval: (interval_first, interval_last), score: interval_score }); + } + }; + + // we compute the matches interval if we have at least 2 matches. + // current interval positions. + let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + let interval_score = + MatchIntervalScore::new(&matches[interval_first..=interval_last]); + + // keep interval if it's the best + save_best_interval(interval_first, interval_last, interval_score); + } + + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + interval_first_match_first_word_pos = + matches[interval_first].get_first_word_pos(); + + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos + < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. + let interval_last = matches_len - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { + let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); + save_best_interval(interval_first, interval_last, interval_score); + } + + // if none of the matches fit the criteria above, default to the first one + let best_interval = best_interval.map_or((0, 0), |v| v.interval); + &matches[best_interval.0..=best_interval.1] + } +} + /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -415,14 +558,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { (crop_byte_start, crop_byte_end) } else { - // there's one match? and it's longer than the crop window, so we have to advance inward + // there's one match and it's longer than the crop window, so we have to advance inward let mut remaining_extra_words = matches_window_len - crop_size; let mut tokens_from_end = tokens[..=last_match_last_token_position].iter().rev().peekable(); while remaining_extra_words > 0 { - let token_from_end_kind = - tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO"); + let token_from_end_kind = tokens_from_end + .peek() + .map(SimpleTokenKind::get) + .expect("Expected iterator to not reach end"); if token_from_end_kind.is_not_separator() { remaining_extra_words -= 1; } @@ -435,165 +580,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } else { &0 }; - let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO"); + let crop_byte_end = tokens_from_end + .next() + .map(|t| t.byte_start) + .expect("Expected iterator to not reach end"); (*crop_byte_start, crop_byte_end) } } - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - // count score for phrases - fn tally_phrase_scores( - fwp: &usize, - lwp: &usize, - order_score: &mut i16, - distance_score: &mut i16, - ) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - } - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - (uniq_score, distance_score, order_score) - } - - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] { - let matches_len = matches.len(); - if matches_len <= 1 { - return matches; - } - - // positions of the first and the last match of the best matches interval in `matches`. - struct BestInterval { - interval: (usize, usize), - score: (i16, i16, i16), - } - - fn save_best_interval( - best_interval: &mut Option, - interval_first: usize, - interval_last: usize, - interval_score: (i16, i16, i16), - ) { - if let Some(best_interval) = best_interval { - if interval_score > best_interval.score { - best_interval.interval = (interval_first, interval_last); - best_interval.score = interval_score; - } - } else { - *best_interval = Some(BestInterval { - interval: (interval_first, interval_last), - score: interval_score, - }); - } - } - - let mut best_interval: Option = None; - - // we compute the matches interval if we have at least 2 matches. - // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - - // keep interval if it's the best - save_best_interval( - &mut best_interval, - interval_first, - interval_last, - interval_score, - ); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - let interval_score = - self.match_interval_score(&matches[interval_first..=interval_last]); - save_best_interval(&mut best_interval, interval_first, interval_last, interval_score); - } - - // if none of the matches fit the criteria above, default to the first one - let best_interval = best_interval.map_or((0, 0), |v| v.interval); - &matches[best_interval.0..=best_interval.1] - } - // Returns the formatted version of the original text. pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> { if !format_options.highlight && format_options.crop.is_none() { @@ -606,7 +601,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // crop around the best interval. let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { - let matches = self.find_best_match_interval(matches, crop_size); + let matches = MatchIntervalWithScore::find_best_match_interval( + matches, crop_size, + ); self.crop_bounds(tokens, matches, crop_size) } _ => (0, self.text.len()), @@ -1046,6 +1043,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), + // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase @"The groundbreaking invention had the power to split the world…" ); @@ -1057,6 +1055,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), + // @TODO: Should probably include end of string in this case? @"…between those who embraced progress and those who resisted change…" ); @@ -1090,17 +1089,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - @"…invention had the power to split the world between those…" - ); - - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"had the power to split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), + // @TODO: "invention" should be highlighted as well @"…invention had the power to split the world between those…" ); } From d9e4db9983e7017bb13a89f7e28def43069e1a58 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 17:50:59 +0300 Subject: [PATCH 15/21] Refactor --- milli/src/search/new/matches/mod.rs | 38 ++++++++++++----------------- 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 804b59553..1552de8aa 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -166,19 +166,12 @@ enum SimpleTokenKind { } impl SimpleTokenKind { - fn get(token: &&Token<'_>) -> Self { + fn new(token: &&Token<'_>) -> Self { match token.kind { TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), _ => Self::NotSeparator, } } - - fn is_not_separator(&self) -> bool { - match self { - SimpleTokenKind::NotSeparator => true, - SimpleTokenKind::Separator(_) => false, - } - } } #[derive(PartialEq, PartialOrd)] @@ -259,9 +252,12 @@ impl MatchIntervalWithScore { // positions of the first and the last match of the best matches interval in `matches`. let mut best_interval: Option = None; - let mut save_best_interval = |interval_first, interval_last, interval_score| { + + let mut save_best_interval = |interval_first, interval_last| { + let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); let is_interval_score_better = &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); + if *is_interval_score_better { best_interval = Some(Self { interval: (interval_first, interval_last), score: interval_score }); @@ -286,11 +282,8 @@ impl MatchIntervalWithScore { // if index is 0 there is no last viable match if index != 0 { let interval_last = index - 1; - let interval_score = - MatchIntervalScore::new(&matches[interval_first..=interval_last]); - // keep interval if it's the best - save_best_interval(interval_first, interval_last, interval_score); + save_best_interval(interval_first, interval_last); } // advance start of the interval while interval is longer than crop_size. @@ -314,8 +307,7 @@ impl MatchIntervalWithScore { // if it's the last match with itself, we need to make sure it's // not a phrase longer than the crop window if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); - save_best_interval(interval_first, interval_last, interval_score); + save_best_interval(interval_first, interval_last); } // if none of the matches fit the criteria above, default to the first one @@ -359,6 +351,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { + // @TODO: Shouldn't this be +1? match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { @@ -484,8 +477,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // grows the crop window peeking in both directions // until the window contains the good number of words: while remaining_words > 0 { - let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get); - let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get); + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); + let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new); match (before_token_kind, after_token_kind) { // we can expand both sides. @@ -504,7 +497,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { if remaining_words > 1 { after_tokens.next(); } - } else if let SeparatorKind::Hard = before_token_separator_kind { + } else if matches!(before_token_separator_kind, SeparatorKind::Hard) + { after_tokens.next(); } else { before_tokens.next(); @@ -536,14 +530,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // the end of the text is reached, advance left. (Some(before_token_kind), None) => { before_tokens.next(); - if let SimpleTokenKind::NotSeparator = before_token_kind { + if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { remaining_words -= 1; } } // the start of the text is reached, advance right. (None, Some(after_token_kind)) => { after_tokens.next(); - if let SimpleTokenKind::NotSeparator = after_token_kind { + if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { remaining_words -= 1; } } @@ -566,9 +560,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { while remaining_extra_words > 0 { let token_from_end_kind = tokens_from_end .peek() - .map(SimpleTokenKind::get) + .map(SimpleTokenKind::new) .expect("Expected iterator to not reach end"); - if token_from_end_kind.is_not_separator() { + if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) { remaining_extra_words -= 1; } From 37a9d64c4441bb6a4a199ad018ab4ddb44d4d958 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Tue, 1 Oct 2024 22:52:01 +0300 Subject: [PATCH 16/21] Fix failing test, refactor --- milli/src/search/new/matches/mod.rs | 44 ++++++++++++++++++----------- 1 file changed, 27 insertions(+), 17 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 1552de8aa..ae1264482 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -245,8 +245,7 @@ struct MatchIntervalWithScore { impl MatchIntervalWithScore { /// Returns the matches interval where the score computed by match_interval_score is the best. fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { - let matches_len = matches.len(); - if matches_len <= 1 { + if matches.len() <= 1 { return matches; } @@ -303,7 +302,7 @@ impl MatchIntervalWithScore { } // compute the last interval score and compare it to the best one. - let interval_last = matches_len - 1; + let interval_last = matches.len() - 1; // if it's the last match with itself, we need to make sure it's // not a phrase longer than the crop window if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { @@ -451,28 +450,39 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let first_match_first_word_position = - matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0); - let first_match_first_token_position = - matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0); - let last_match_last_word_position = - matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0); - let last_match_last_token_position = - matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0); + let (matches_size, first_match_first_token_position, last_match_last_token_position) = + if !matches.is_empty() { + let matches_first = matches.first().unwrap(); + let matches_last = matches.last().unwrap(); - let matches_window_len = - last_match_last_word_position - first_match_first_word_position + 1; + ( + matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1, + matches_first.get_first_token_pos(), + matches_last.get_last_token_pos(), + ) + } else { + (0, 0, 0) + }; - if crop_size >= matches_window_len { + if crop_size >= matches_size { // matches needs to be counted in the crop len. - let mut remaining_words = crop_size - matches_window_len; + let mut remaining_words = crop_size - matches_size; + + let last_match_last_token_position_plus_one = last_match_last_token_position + 1; + let after_tokens_starting_index = if matches_size == 0 { + 0 + } else if last_match_last_token_position_plus_one < tokens.len() { + last_match_last_token_position_plus_one + } else { + tokens.len() + }; // create the initial state of the crop window: 2 iterators starting from the matches positions, // a reverse iterator starting from the first match token position and going towards the beginning of the text, let mut before_tokens = tokens[..first_match_first_token_position].iter().rev().peekable(); // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable(); + let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable(); // grows the crop window peeking in both directions // until the window contains the good number of words: @@ -553,7 +563,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { (crop_byte_start, crop_byte_end) } else { // there's one match and it's longer than the crop window, so we have to advance inward - let mut remaining_extra_words = matches_window_len - crop_size; + let mut remaining_extra_words = matches_size - crop_size; let mut tokens_from_end = tokens[..=last_match_last_token_position].iter().rev().peekable(); From 40336ce87d46b43123d03cb343b4d3f785001a9c Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 3 Oct 2024 10:40:14 +0300 Subject: [PATCH 17/21] Fix and refactor crop_bounds --- milli/src/search/new/matches/mod.rs | 231 ++++++++++++++-------------- 1 file changed, 113 insertions(+), 118 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ae1264482..f8d60ef54 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,6 +1,7 @@ use std::borrow::Cow; use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; +use either::Either; pub use matching_words::MatchingWords; use matching_words::{MatchType, PartialMatch, WordId}; use serde::Serialize; @@ -450,147 +451,141 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { crop_size: usize, ) -> (usize, usize) { // if there is no match, we start from the beginning of the string by default. - let (matches_size, first_match_first_token_position, last_match_last_token_position) = - if !matches.is_empty() { - let matches_first = matches.first().unwrap(); - let matches_last = matches.last().unwrap(); + let ( + mut remaining_words, + is_iterating_forward, + before_tokens_starting_index, + after_tokens_starting_index, + ) = if !matches.is_empty() { + let matches_first = matches.first().unwrap(); + let matches_last = matches.last().unwrap(); - ( - matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1, - matches_first.get_first_token_pos(), - matches_last.get_last_token_pos(), - ) + let matches_size = + matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; + + let is_crop_size_gte_match_size = crop_size >= matches_size; + let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size; + + let remaining_words = if is_crop_size_gte_match_size { + crop_size - matches_size } else { - (0, 0, 0) + // in case matches size is greater than crop size, which implies there's only one match, + // we count words backwards, because we have to remove words, as they're extra words outside of + // crop window + matches_size - crop_size }; - if crop_size >= matches_size { - // matches needs to be counted in the crop len. - let mut remaining_words = crop_size - matches_size; - - let last_match_last_token_position_plus_one = last_match_last_token_position + 1; let after_tokens_starting_index = if matches_size == 0 { 0 - } else if last_match_last_token_position_plus_one < tokens.len() { - last_match_last_token_position_plus_one } else { - tokens.len() + let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1; + if last_match_last_token_position_plus_one < tokens.len() { + last_match_last_token_position_plus_one + } else { + // we have matched the end of possible tokens, there's nothing to advance + tokens.len() - 1 + } }; - // create the initial state of the crop window: 2 iterators starting from the matches positions, - // a reverse iterator starting from the first match token position and going towards the beginning of the text, - let mut before_tokens = - tokens[..first_match_first_token_position].iter().rev().peekable(); - // an iterator starting from the last match token position and going towards the end of the text. - let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable(); + ( + remaining_words, + is_iterating_forward, + if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 }, + after_tokens_starting_index, + ) + } else { + (crop_size, true, 0, 0) + }; - // grows the crop window peeking in both directions - // until the window contains the good number of words: - while remaining_words > 0 { - let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); - let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new); + // create the initial state of the crop window: 2 iterators starting from the matches positions, + // a reverse iterator starting from the first match token position and going towards the beginning of the text, + let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable(); + // an iterator ... + let mut after_tokens = if is_iterating_forward { + // ... starting from the last match token position and going towards the end of the text. + Either::Left(tokens[after_tokens_starting_index..].iter().peekable()) + } else { + // ... starting from the last match token position and going towards the start of the text. + Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable()) + }; - match (before_token_kind, after_token_kind) { - // we can expand both sides. - (Some(before_token_kind), Some(after_token_kind)) => { - match (before_token_kind, after_token_kind) { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator separator side. - ( - SimpleTokenKind::Separator(before_token_separator_kind), - SimpleTokenKind::Separator(after_token_separator_kind), - ) => { - if before_token_separator_kind == after_token_separator_kind { - before_tokens.next(); + // grows the crop window peeking in both directions + // until the window contains the good number of words: + while remaining_words > 0 { + let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new); + let after_token_kind = + after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new); - // this avoid having an ending separator before crop marker. - if remaining_words > 1 { - after_tokens.next(); - } - } else if matches!(before_token_separator_kind, SeparatorKind::Hard) - { - after_tokens.next(); - } else { - before_tokens.next(); - } - } - // if one of the tokens is a word, we expend in the side of the word. - // left is a word, advance left. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + match (before_token_kind, after_token_kind) { + // we can expand both sides. + (Some(before_token_kind), Some(after_token_kind)) => { + match (before_token_kind, after_token_kind) { + // if they are both separators and are the same kind then advance both, + // or expand in the soft separator separator side. + ( + SimpleTokenKind::Separator(before_token_separator_kind), + SimpleTokenKind::Separator(after_token_separator_kind), + ) => { + if before_token_separator_kind == after_token_separator_kind { + before_tokens.next(); + + // this avoid having an ending separator before crop marker. + if remaining_words > 1 { + after_tokens.next(); + } + } else if matches!(before_token_separator_kind, SeparatorKind::Hard) { + after_tokens.next(); + } else { before_tokens.next(); - remaining_words -= 1; } - // right is a word, advance right. - (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { + } + // if one of the tokens is a word, we expend in the side of the word. + // left is a word, advance left. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => { + before_tokens.next(); + remaining_words -= 1; + } + // right is a word, advance right. + (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => { + after_tokens.next(); + remaining_words -= 1; + } + // both are words, advance left then right if remaining_word > 0. + (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { + before_tokens.next(); + remaining_words -= 1; + + if remaining_words > 0 { after_tokens.next(); remaining_words -= 1; } - // both are words, advance left then right if remaining_word > 0. - (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => { - before_tokens.next(); - remaining_words -= 1; - - if remaining_words > 0 { - after_tokens.next(); - remaining_words -= 1; - } - } } } - // the end of the text is reached, advance left. - (Some(before_token_kind), None) => { - before_tokens.next(); - if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // the start of the text is reached, advance right. - (None, Some(after_token_kind)) => { - after_tokens.next(); - if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { - remaining_words -= 1; - } - } - // no more token to add. - (None, None) => break, } - } - - // finally, keep the byte index of each bound of the crop window. - let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); - let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - - (crop_byte_start, crop_byte_end) - } else { - // there's one match and it's longer than the crop window, so we have to advance inward - let mut remaining_extra_words = matches_size - crop_size; - let mut tokens_from_end = - tokens[..=last_match_last_token_position].iter().rev().peekable(); - - while remaining_extra_words > 0 { - let token_from_end_kind = tokens_from_end - .peek() - .map(SimpleTokenKind::new) - .expect("Expected iterator to not reach end"); - if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) { - remaining_extra_words -= 1; + // the end of the text is reached, advance left. + (Some(before_token_kind), None) => { + before_tokens.next(); + if matches!(before_token_kind, SimpleTokenKind::NotSeparator) { + remaining_words -= 1; + } } - - tokens_from_end.next(); + // the start of the text is reached, advance right. + (None, Some(after_token_kind)) => { + after_tokens.next(); + if matches!(after_token_kind, SimpleTokenKind::NotSeparator) { + remaining_words -= 1; + } + } + // no more token to add. + (None, None) => break, } - - let crop_byte_start = if first_match_first_token_position > 0 { - &tokens[first_match_first_token_position - 1].byte_end - } else { - &0 - }; - let crop_byte_end = tokens_from_end - .next() - .map(|t| t.byte_start) - .expect("Expected iterator to not reach end"); - - (*crop_byte_start, crop_byte_end) } + + // finally, keep the byte index of each bound of the crop window. + let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); + let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); + + (crop_byte_start, crop_byte_end) } // Returns the formatted version of the original text. From 8221c94e7f5666c73944cc5f57211a0eb4035b59 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Thu, 3 Oct 2024 15:37:51 +0300 Subject: [PATCH 18/21] Split into multiple files, refactor --- .../search/new/matches/best_match_interval.rs | 139 ++++++++++ milli/src/search/new/matches/match.rs | 62 +++++ milli/src/search/new/matches/mod.rs | 244 +----------------- .../search/new/matches/simple_token_kind.rs | 15 ++ 4 files changed, 230 insertions(+), 230 deletions(-) create mode 100644 milli/src/search/new/matches/best_match_interval.rs create mode 100644 milli/src/search/new/matches/match.rs create mode 100644 milli/src/search/new/matches/simple_token_kind.rs diff --git a/milli/src/search/new/matches/best_match_interval.rs b/milli/src/search/new/matches/best_match_interval.rs new file mode 100644 index 000000000..a6497f351 --- /dev/null +++ b/milli/src/search/new/matches/best_match_interval.rs @@ -0,0 +1,139 @@ +use super::matching_words::WordId; +use super::{Match, MatchPosition}; + +struct MatchIntervalWithScore { + interval: [usize; 2], + score: [i16; 3], +} + +// count score for phrases +fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) { + let words_in_phrase_minus_one = (lwp - fwp) as i16; + // will always be ordered, so +1 for each space between words + *order_score += words_in_phrase_minus_one; + // distance will always be 1, so -1 for each space between words + *distance_score -= words_in_phrase_minus_one; +} + +/// Compute the score of a match interval: +/// 1) count unique matches +/// 2) calculate distance between matches +/// 3) count ordered matches +fn get_interval_score(matches: &[Match]) -> [i16; 3] { + let mut ids: Vec = Vec::with_capacity(matches.len()); + let mut order_score = 0; + let mut distance_score = 0; + + let mut iter = matches.iter().peekable(); + while let Some(m) = iter.next() { + if let Some(next_match) = iter.peek() { + // if matches are ordered + if next_match.ids.iter().min() > m.ids.iter().min() { + order_score += 1; + } + + let m_last_word_pos = match m.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => { + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + lwp + } + }; + let next_match_first_word_pos = next_match.get_first_word_pos(); + + // compute distance between matches + distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; + } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position { + // in case last match is a phrase, count score for its words + tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); + } + + ids.extend(m.ids.iter()); + } + + ids.sort_unstable(); + ids.dedup(); + let uniq_score = ids.len() as i16; + + // rank by unique match count, then by distance between matches, then by ordered match count. + [uniq_score, distance_score, order_score] +} + +/// Returns the first and last match where the score computed by match_interval_score is the best. +pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] { + if matches.is_empty() { + panic!("`matches` should not be empty at this point"); + } + + // positions of the first and the last match of the best matches interval in `matches`. + let mut best_interval: Option = None; + + let mut save_best_interval = |interval_first, interval_last| { + let interval_score = get_interval_score(&matches[interval_first..=interval_last]); + let is_interval_score_better = &best_interval + .as_ref() + .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score); + + if *is_interval_score_better { + best_interval = Some(MatchIntervalWithScore { + interval: [interval_first, interval_last], + score: interval_score, + }); + } + }; + + // we compute the matches interval if we have at least 2 matches. + // current interval positions. + let mut interval_first = 0; + let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + for (index, next_match) in matches.iter().enumerate() { + // if next match would make interval gross more than crop_size, + // we compare the current interval with the best one, + // then we increase `interval_first` until next match can be added. + let next_match_last_word_pos = next_match.get_last_word_pos(); + + // if the next match would mean that we pass the crop size window, + // we take the last valid match, that didn't pass this boundry, which is `index` - 1, + // and calculate a score for it, and check if it's better than our best so far + if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { + // if index is 0 there is no last viable match + if index != 0 { + let interval_last = index - 1; + // keep interval if it's the best + save_best_interval(interval_first, interval_last); + } + + // advance start of the interval while interval is longer than crop_size. + loop { + interval_first += 1; + if interval_first == matches.len() { + interval_first -= 1; + break; + } + + interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); + + if interval_first_match_first_word_pos > next_match_last_word_pos + || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size + { + break; + } + } + } + } + + // compute the last interval score and compare it to the best one. + let interval_last = matches.len() - 1; + // if it's the last match with itself, we need to make sure it's + // not a phrase longer than the crop window + if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { + save_best_interval(interval_first, interval_last); + } + + // if none of the matches fit the criteria above, default to the first one + best_interval.map_or( + [&matches[0], &matches[0]], + |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]], + ) +} diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs new file mode 100644 index 000000000..cc08b006c --- /dev/null +++ b/milli/src/search/new/matches/match.rs @@ -0,0 +1,62 @@ +use super::matching_words::WordId; + +#[derive(Clone, Debug)] +pub enum MatchPosition { + Word { + // position of the word in the whole text. + word_position: usize, + // position of the token in the whole text. + token_position: usize, + }, + Phrase { + // position of the first and last word in the phrase in the whole text. + word_positions: [usize; 2], + // position of the first and last token in the phrase in the whole text. + token_positions: [usize; 2], + }, +} + +#[derive(Clone, Debug)] +pub struct Match { + pub match_len: usize, + // ids of the query words that matches. + pub ids: Vec, + pub position: MatchPosition, +} + +impl Match { + pub(super) fn get_first_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp, + } + } + + pub(super) fn get_last_word_pos(&self) -> usize { + match self.position { + MatchPosition::Word { word_position, .. } => word_position, + MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp, + } + } + + pub(super) fn get_first_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp, + } + } + + pub(super) fn get_last_token_pos(&self) -> usize { + match self.position { + MatchPosition::Word { token_position, .. } => token_position, + MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp, + } + } + + pub(super) fn get_word_count(&self) -> usize { + match self.position { + MatchPosition::Word { .. } => 1, + MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1, + } + } +} diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index f8d60ef54..3df361702 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -1,12 +1,16 @@ -use std::borrow::Cow; +mod best_match_interval; +mod r#match; +mod matching_words; +mod simple_token_kind; -use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer}; +use charabia::{Language, SeparatorKind, Token, Tokenizer}; use either::Either; pub use matching_words::MatchingWords; -use matching_words::{MatchType, PartialMatch, WordId}; +use matching_words::{MatchType, PartialMatch}; +use r#match::{Match, MatchPosition}; use serde::Serialize; - -pub mod matching_words; +use simple_token_kind::SimpleTokenKind; +use std::borrow::Cow; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; @@ -94,228 +98,12 @@ impl FormatOptions { } } -#[derive(Clone, Debug)] -pub enum MatchPosition { - Word { - // position of the word in the whole text. - word_position: usize, - // position of the token in the whole text. - token_position: usize, - }, - Phrase { - // position of the first and last word in the phrase in the whole text. - word_positions: (usize, usize), - // position of the first and last token in the phrase in the whole text. - token_positions: (usize, usize), - }, -} - -#[derive(Clone, Debug)] -pub struct Match { - match_len: usize, - // ids of the query words that matches. - ids: Vec, - position: MatchPosition, -} - -impl Match { - fn get_first_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp, - } - } - - fn get_last_word_pos(&self) -> usize { - match self.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp, - } - } - - fn get_first_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp, - } - } - - fn get_last_token_pos(&self) -> usize { - match self.position { - MatchPosition::Word { token_position, .. } => token_position, - MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp, - } - } - - fn get_word_count(&self) -> usize { - match self.position { - MatchPosition::Word { .. } => 1, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1, - } - } -} - #[derive(Serialize, Debug, Clone, PartialEq, Eq)] pub struct MatchBounds { pub start: usize, pub length: usize, } -enum SimpleTokenKind { - Separator(SeparatorKind), - NotSeparator, -} - -impl SimpleTokenKind { - fn new(token: &&Token<'_>) -> Self { - match token.kind { - TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), - _ => Self::NotSeparator, - } - } -} - -#[derive(PartialEq, PartialOrd)] -struct MatchIntervalScore(i16, i16, i16); - -impl MatchIntervalScore { - /// Compute the score of a match interval: - /// 1) count unique matches - /// 2) calculate distance between matches - /// 3) count ordered matches - fn new(matches: &[Match]) -> Self { - let mut ids: Vec = Vec::with_capacity(matches.len()); - let mut order_score = 0; - let mut distance_score = 0; - - // count score for phrases - fn tally_phrase_scores( - fwp: &usize, - lwp: &usize, - order_score: &mut i16, - distance_score: &mut i16, - ) { - let words_in_phrase_minus_one = (lwp - fwp) as i16; - // will always be ordered, so +1 for each space between words - *order_score += words_in_phrase_minus_one; - // distance will always be 1, so -1 for each space between words - *distance_score -= words_in_phrase_minus_one; - } - - let mut iter = matches.iter().peekable(); - while let Some(m) = iter.next() { - if let Some(next_match) = iter.peek() { - // if matches are ordered - if next_match.ids.iter().min() > m.ids.iter().min() { - order_score += 1; - } - - let m_last_word_pos = match m.position { - MatchPosition::Word { word_position, .. } => word_position, - MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => { - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - lwp - } - }; - let next_match_first_word_pos = next_match.get_first_word_pos(); - - // compute distance between matches - distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16; - } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position { - // in case last match is a phrase, count score for its words - tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score); - } - - ids.extend(m.ids.iter()); - } - - ids.sort_unstable(); - ids.dedup(); - let uniq_score = ids.len() as i16; - - // rank by unique match count, then by distance between matches, then by ordered match count. - Self(uniq_score, distance_score, order_score) - } -} - -struct MatchIntervalWithScore { - interval: (usize, usize), - score: MatchIntervalScore, -} - -impl MatchIntervalWithScore { - /// Returns the matches interval where the score computed by match_interval_score is the best. - fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] { - if matches.len() <= 1 { - return matches; - } - - // positions of the first and the last match of the best matches interval in `matches`. - let mut best_interval: Option = None; - - let mut save_best_interval = |interval_first, interval_last| { - let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]); - let is_interval_score_better = - &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score); - - if *is_interval_score_better { - best_interval = - Some(Self { interval: (interval_first, interval_last), score: interval_score }); - } - }; - - // we compute the matches interval if we have at least 2 matches. - // current interval positions. - let mut interval_first = 0; - let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos(); - - for (index, next_match) in matches.iter().enumerate() { - // if next match would make interval gross more than crop_size, - // we compare the current interval with the best one, - // then we increase `interval_first` until next match can be added. - let next_match_last_word_pos = next_match.get_last_word_pos(); - - // if the next match would mean that we pass the crop size window, - // we take the last valid match, that didn't pass this boundry, which is `index` - 1, - // and calculate a score for it, and check if it's better than our best so far - if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size { - // if index is 0 there is no last viable match - if index != 0 { - let interval_last = index - 1; - // keep interval if it's the best - save_best_interval(interval_first, interval_last); - } - - // advance start of the interval while interval is longer than crop_size. - loop { - interval_first += 1; - interval_first_match_first_word_pos = - matches[interval_first].get_first_word_pos(); - - if interval_first_match_first_word_pos > next_match_last_word_pos - || next_match_last_word_pos - interval_first_match_first_word_pos - < crop_size - { - break; - } - } - } - } - - // compute the last interval score and compare it to the best one. - let interval_last = matches.len() - 1; - // if it's the last match with itself, we need to make sure it's - // not a phrase longer than the crop window - if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size { - save_best_interval(interval_first, interval_last); - } - - // if none of the matches fit the criteria above, default to the first one - let best_interval = best_interval.map_or((0, 0), |v| v.interval); - &matches[best_interval.0..=best_interval.1] - } -} - /// Structure used to analyze a string, compute words that match, /// and format the source string, returning a highlighted and cropped sub-string. pub struct Matcher<'t, 'tokenizer, 'b, 'lang> { @@ -355,8 +143,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { - word_positions: (first_word_position, word_position), - token_positions: (first_token_position, token_position), + word_positions: [first_word_position, word_position], + token_positions: [first_token_position, token_position], }, }); @@ -450,15 +238,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { matches: &[Match], crop_size: usize, ) -> (usize, usize) { - // if there is no match, we start from the beginning of the string by default. let ( mut remaining_words, is_iterating_forward, before_tokens_starting_index, after_tokens_starting_index, ) = if !matches.is_empty() { - let matches_first = matches.first().unwrap(); - let matches_last = matches.last().unwrap(); + let [matches_first, matches_last] = + best_match_interval::find_best_match_interval(matches, crop_size); let matches_size = matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1; @@ -600,9 +387,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { // crop around the best interval. let (byte_start, byte_end) = match format_options.crop { Some(crop_size) if crop_size > 0 => { - let matches = MatchIntervalWithScore::find_best_match_interval( - matches, crop_size, - ); self.crop_bounds(tokens, matches, crop_size) } _ => (0, self.text.len()), @@ -625,7 +409,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let token = &tokens[token_position]; (&token.byte_start, &token.byte_end) } - MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => { + MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { (&tokens[ftp].byte_start, &tokens[ltp].byte_end) } }; diff --git a/milli/src/search/new/matches/simple_token_kind.rs b/milli/src/search/new/matches/simple_token_kind.rs new file mode 100644 index 000000000..b34a8c985 --- /dev/null +++ b/milli/src/search/new/matches/simple_token_kind.rs @@ -0,0 +1,15 @@ +use charabia::{SeparatorKind, Token, TokenKind}; + +pub enum SimpleTokenKind { + Separator(SeparatorKind), + NotSeparator, +} + +impl SimpleTokenKind { + pub fn new(token: &&Token<'_>) -> Self { + match token.kind { + TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind), + _ => Self::NotSeparator, + } + } +} From c3de3a9ab75e6be99314400137b8329cdf46ff12 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:30:31 +0300 Subject: [PATCH 19/21] Refactor --- milli/src/search/new/matches/matching_words.rs | 12 +++--------- milli/src/search/new/matches/mod.rs | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 4deaff6a0..e4d2785ca 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -130,7 +130,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> { word.map(|word| self.matching_words.word_interner.get(word).as_str()) }) .collect(); - let partial = PartialMatch { matching_words: words, ids, char_len: 0 }; + let partial = PartialMatch { matching_words: words, ids }; partial.match_token(self.token).or_else(|| self.next()) } @@ -158,7 +158,6 @@ pub enum MatchType<'a> { pub struct PartialMatch<'a> { matching_words: Vec>, ids: &'a RangeInclusive, - char_len: usize, } impl<'a> PartialMatch<'a> { @@ -176,25 +175,20 @@ impl<'a> PartialMatch<'a> { None => token.is_stopword(), }; - let char_len = token.char_end - token.char_start; // if there are remaining words to match in the phrase and the current token is matching, // return a new Partial match allowing the highlighter to continue. if is_matching && matching_words.len() > 1 { matching_words.remove(0); - Some(MatchType::Partial(Self { matching_words, ids, char_len })) + Some(MatchType::Partial(Self { matching_words, ids })) // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { - Some(MatchType::Full { char_len, ids }) + Some(MatchType::Full { char_len: token.char_end - token.char_start, ids }) // if the current token doesn't match, return None to break the match sequence. } else { None } } - - pub fn char_len(&self) -> usize { - self.char_len - } } impl fmt::Debug for MatchingWords { diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 3df361702..9ca560529 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -139,7 +139,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - // @TODO: Shouldn't this be +1? match_len: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { From 03579aba13853560059cec3c881e284b4f7a307a Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Fri, 4 Oct 2024 11:38:47 +0300 Subject: [PATCH 20/21] Adjust test --- milli/src/search/new/matches/mod.rs | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index 9ca560529..ac0fb7e7b 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -798,12 +798,12 @@ mod tests { @"…the power to split the world between those who embraced…" ); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\""); + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\""); let mut matcher = builder.build(text, None); // should highlight "those" and the phrase "and those". insta::assert_snapshot!( matcher.format(format_options), - @"…groundbreaking invention had the power to split the world between…" + @"…world between those who embraced progress and those who resisted…" ); let builder = MatcherBuilder::new_test( @@ -841,17 +841,6 @@ mod tests { @"…between those who embraced progress and those who resisted change…" ); - let builder = MatcherBuilder::new_test( - &rtxn, - &temp_index, - "\"The groundbreaking invention\" \"split the world between those\"", - ); - let mut matcher = builder.build(text, None); - insta::assert_snapshot!( - matcher.format(format_options), - @"…the power to split the world between those who embraced…" - ); - let builder = MatcherBuilder::new_test( &rtxn, &temp_index, From e51e6f902a13525610c4d0a81125c7292da3de36 Mon Sep 17 00:00:00 2001 From: "F. Levi" <55688616+flevi29@users.noreply.github.com> Date: Sat, 19 Oct 2024 13:42:02 +0300 Subject: [PATCH 21/21] Highlight partially cropped matches too --- milli/src/search/new/matches/match.rs | 2 +- .../src/search/new/matches/matching_words.rs | 25 +++-- milli/src/search/new/matches/mod.rs | 94 ++++++++++--------- 3 files changed, 67 insertions(+), 54 deletions(-) diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs index cc08b006c..2eef4d5a6 100644 --- a/milli/src/search/new/matches/match.rs +++ b/milli/src/search/new/matches/match.rs @@ -18,7 +18,7 @@ pub enum MatchPosition { #[derive(Clone, Debug)] pub struct Match { - pub match_len: usize, + pub char_count: usize, // ids of the query words that matches. pub ids: Vec, pub position: MatchPosition, diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index e4d2785ca..1f30a17ad 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -86,14 +86,17 @@ impl MatchingWords { continue; }; let prefix_length = char_index + c.len_utf8(); - let char_len = token.original_lengths(prefix_length).0; + let (char_count, byte_len) = token.original_lengths(prefix_length); let ids = &located_words.positions; - return Some(MatchType::Full { char_len, ids }); + return Some(MatchType::Full { ids, char_count, byte_len }); // else we exact match the token. } else if token.lemma() == word { - let char_len = token.char_end - token.char_start; let ids = &located_words.positions; - return Some(MatchType::Full { char_len, ids }); + return Some(MatchType::Full { + char_count: token.char_end - token.char_start, + byte_len: token.byte_end - token.byte_start, + ids, + }); } } } @@ -149,7 +152,7 @@ pub type WordId = u16; /// In these cases we need to match consecutively several tokens to consider that the match is full. #[derive(Debug, PartialEq)] pub enum MatchType<'a> { - Full { char_len: usize, ids: &'a RangeInclusive }, + Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive }, Partial(PartialMatch<'a>), } @@ -183,7 +186,11 @@ impl<'a> PartialMatch<'a> { // if there is no remaining word to match in the phrase and the current token is matching, // return a Full match. } else if is_matching { - Some(MatchType::Full { char_len: token.char_end - token.char_start, ids }) + Some(MatchType::Full { + char_count: token.char_end - token.char_start, + byte_len: token.byte_end - token.byte_start, + ids, + }) // if the current token doesn't match, return None to break the match sequence. } else { None @@ -270,7 +277,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(0..=0) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) }) ); assert_eq!( matching_words @@ -294,7 +301,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) ); assert_eq!( matching_words @@ -306,7 +313,7 @@ pub(crate) mod tests { ..Default::default() }) .next(), - Some(MatchType::Full { char_len: 5, ids: &(2..=2) }) + Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) }) ); assert_eq!( matching_words diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs index ac0fb7e7b..80e3ec7b2 100644 --- a/milli/src/search/new/matches/mod.rs +++ b/milli/src/search/new/matches/mod.rs @@ -10,7 +10,10 @@ use matching_words::{MatchType, PartialMatch}; use r#match::{Match, MatchPosition}; use serde::Serialize; use simple_token_kind::SimpleTokenKind; -use std::borrow::Cow; +use std::{ + borrow::Cow, + cmp::{max, min}, +}; const DEFAULT_CROP_MARKER: &str = "…"; const DEFAULT_HIGHLIGHT_PREFIX: &str = ""; @@ -139,7 +142,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { Some(MatchType::Full { ids, .. }) => { // save the token that closes the partial match as a match. matches.push(Match { - match_len: word.char_end - *first_word_char_start, + char_count: word.char_end - *first_word_char_start, ids: ids.clone().collect(), position: MatchPosition::Phrase { word_positions: [first_word_position, word_position], @@ -182,10 +185,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { match match_type { // we match, we save the current token as a match, // then we continue the rest of the tokens. - MatchType::Full { char_len, ids } => { + MatchType::Full { ids, char_count, .. } => { let ids: Vec<_> = ids.clone().collect(); matches.push(Match { - match_len: char_len, + char_count, ids, position: MatchPosition::Word { word_position, token_position }, }); @@ -224,19 +227,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { .iter() .map(|m| MatchBounds { start: tokens[m.get_first_token_pos()].byte_start, - length: m.match_len, + // TODO: Why is this in chars, while start is in bytes? + length: m.char_count, }) .collect(), } } /// Returns the bounds in byte index of the crop window. - fn crop_bounds( - &self, - tokens: &[Token<'_>], - matches: &[Match], - crop_size: usize, - ) -> (usize, usize) { + fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] { let ( mut remaining_words, is_iterating_forward, @@ -371,7 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end); let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start); - (crop_byte_start, crop_byte_end) + [crop_byte_start, crop_byte_end] } // Returns the formatted version of the original text. @@ -382,78 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> { } else { match &self.matches { Some((tokens, matches)) => { - // If the text has to be cropped, - // crop around the best interval. - let (byte_start, byte_end) = match format_options.crop { + // If the text has to be cropped, crop around the best interval. + let [crop_byte_start, crop_byte_end] = match format_options.crop { Some(crop_size) if crop_size > 0 => { self.crop_bounds(tokens, matches, crop_size) } - _ => (0, self.text.len()), + _ => [0, self.text.len()], }; let mut formatted = Vec::new(); // push crop marker if it's not the start of the text. - if byte_start > 0 && !self.crop_marker.is_empty() { + if crop_byte_start > 0 && !self.crop_marker.is_empty() { formatted.push(self.crop_marker); } - let mut byte_index = byte_start; + let mut byte_index = crop_byte_start; if format_options.highlight { // insert highlight markers around matches. for m in matches { - let (current_byte_start, current_byte_end) = match m.position { + let [m_byte_start, m_byte_end] = match m.position { MatchPosition::Word { token_position, .. } => { let token = &tokens[token_position]; - (&token.byte_start, &token.byte_end) + [&token.byte_start, &token.byte_end] } MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => { - (&tokens[ftp].byte_start, &tokens[ltp].byte_end) + [&tokens[ftp].byte_start, &tokens[ltp].byte_end] } }; - // skip matches out of the crop window. - if *current_byte_start < byte_start || *current_byte_end > byte_end { + // skip matches out of the crop window + if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end { continue; } - if byte_index < *current_byte_start { - formatted.push(&self.text[byte_index..*current_byte_start]); + // adjust start and end to the crop window size + let [m_byte_start, m_byte_end] = [ + max(m_byte_start, &crop_byte_start), + min(m_byte_end, &crop_byte_end), + ]; + + // push text that is positioned before our matches + if byte_index < *m_byte_start { + formatted.push(&self.text[byte_index..*m_byte_start]); } - let highlight_byte_index = self.text[*current_byte_start..] - .char_indices() - .enumerate() - .find(|(i, _)| *i == m.match_len) - .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start); - formatted.push(self.highlight_prefix); - formatted.push(&self.text[*current_byte_start..highlight_byte_index]); + + // TODO: This is additional work done, charabia::token::Token byte_len + // should already get us the original byte length, however, that doesn't work as + // it's supposed to, investigate why + let highlight_byte_index = self.text[*m_byte_start..] + .char_indices() + .nth(m.char_count) + .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end)); + formatted.push(&self.text[*m_byte_start..highlight_byte_index]); + formatted.push(self.highlight_suffix); // if it's a prefix highlight, we put the end of the word after the highlight marker. - if highlight_byte_index < *current_byte_end { - formatted.push(&self.text[highlight_byte_index..*current_byte_end]); + if highlight_byte_index < *m_byte_end { + formatted.push(&self.text[highlight_byte_index..*m_byte_end]); } - byte_index = *current_byte_end; + byte_index = *m_byte_end; } } // push the rest of the text between last match and the end of crop. - if byte_index < byte_end { - formatted.push(&self.text[byte_index..byte_end]); + if byte_index < crop_byte_end { + formatted.push(&self.text[byte_index..crop_byte_end]); } // push crop marker if it's not the end of the text. - if byte_end < self.text.len() && !self.crop_marker.is_empty() { + if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() { formatted.push(self.crop_marker); } if formatted.len() == 1 { // avoid concatenating if there is already 1 slice. - Cow::Borrowed(&self.text[byte_start..byte_end]) + Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end]) } else { Cow::Owned(formatted.concat()) } @@ -825,8 +833,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase - @"The groundbreaking invention had the power to split the world…" + @"The groundbreaking invention had the power to split the world…" ); let builder = MatcherBuilder::new_test( @@ -837,7 +844,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: Should probably include end of string in this case? + // TODO: Should include exclamation mark without crop markers @"…between those who embraced progress and those who resisted change…" ); @@ -860,8 +867,7 @@ mod tests { let mut matcher = builder.build(text, None); insta::assert_snapshot!( matcher.format(format_options), - // @TODO: "invention" should be highlighted as well - @"…invention had the power to split the world between those…" + @"…invention had the power to split the world between those…" ); }