From edcb4c60ba0bc416152bdfd931598bfa0df87467 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Thu, 12 Sep 2024 09:44:37 +0300
Subject: [PATCH 01/21] Change Matcher so that phrases are counted as one
 instead of word by word

---
 milli/src/search/new/matches/mod.rs | 45 +++++++++++------------------
 1 file changed, 17 insertions(+), 28 deletions(-)
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 4688b8f32..6ddb81c6a 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             mut partial: PartialMatch<'a>,
             token_position: usize,
             word_position: usize,
+            first_word_char_start: &usize,
             words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
             matches: &mut Vec<Match>,
         ) -> bool {
-            let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
-
-            for (token_position, word_position, word) in words_positions {
+            for (_, _, word) in words_positions {
                 partial = match partial.match_token(word) {
                     // token matches the partial match, but the match is not full,
                     // we temporarily save the current token then we try to match the next one.
-                    Some(MatchType::Partial(partial)) => {
-                        potential_matches.push((token_position, word_position, partial.char_len()));
-                        partial
-                    }
+                    Some(MatchType::Partial(partial)) => partial,
                     // partial match is now full, we keep this matches and we advance positions
-                    Some(MatchType::Full { char_len, ids }) => {
-                        let ids: Vec<_> = ids.clone().collect();
-                        // save previously matched tokens as matches.
-                        let iter = potential_matches.into_iter().map(
-                            |(token_position, word_position, match_len)| Match {
-                                match_len,
-                                ids: ids.clone(),
-                                word_position,
-                                token_position,
-                            },
-                        );
-                        matches.extend(iter);
-
+                    Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
-                            match_len: char_len,
-                            ids,
+                            match_len: word.char_end - first_word_char_start,
+                            ids: ids.clone().collect(),
                             word_position,
                             token_position,
                         });
@@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                             partial,
                             token_position,
                             word_position,
+                            &word.char_start,
                             &mut wp,
                             &mut matches,
                         ) {
@@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                                 .enumerate()
                                 .find(|(i, _)| *i == m.match_len)
                                 .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
+
                             formatted.push(self.highlight_prefix);
                             formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                             formatted.push(self.highlight_suffix);
+
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
                             if highlight_byte_index < token.byte_end {
                                 formatted.push(&self.text[highlight_byte_index..token.byte_end]);
                             }
 
-                            byte_index = token.byte_end;
+                            byte_index = token.byte_start + m.match_len;
                         }
                     }
 
@@ -821,22 +808,24 @@ mod tests {
     fn format_highlight_crop_phrase_query() {
         //! testing: https://github.com/meilisearch/meilisearch/issues/3975
         let temp_index = TempIndex::new();
+
+        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
         temp_index
             .add_documents(documents!([
-                { "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
+                { "id": 1, "text": text }
             ]))
             .unwrap();
+
         let rtxn = temp_index.read_txn().unwrap();
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
-        let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
 
         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
         let mut matcher = builder.build(text, None);
         // should return 10 words with a marker at the start as well the end, and the highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…had the power to split <em>the</em> <em>world</em> between those who…"
+            @"…had the power to split <em>the world</em> between those who…"
         );
 
         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
@@ -844,7 +833,7 @@ mod tests {
         // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
+            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
         );
     }
 
@@ -900,7 +889,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
+            @"_the_ _do or_ die can't be he do and or isn'_t he_"
         );
     }
 }

From e7af499314f24e51f1bff27ff231ceb898aa27a1 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Thu, 12 Sep 2024 16:58:13 +0300
Subject: [PATCH 02/21] Improve changes to Matcher

---
 milli/src/search/new/matches/mod.rs | 136 +++++++++++++++++++++-------
 1 file changed, 104 insertions(+), 32 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 6ddb81c6a..26dd6f6e8 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -93,15 +93,28 @@ impl FormatOptions {
     }
 }
 
+#[derive(Clone, Debug)]
+pub enum MatchPosition {
+    Word {
+        // position of the word in the whole text.
+        word_position: usize,
+        // position of the token in the whole text.
+        token_position: usize,
+    },
+    Phrase {
+        // position of the first and last word in the phrase in the whole text.
+        word_positions: (usize, usize),
+        // position of the first and last token in the phrase in the whole text.
+        token_positions: (usize, usize),
+    },
+}
+
 #[derive(Clone, Debug)]
 pub struct Match {
     match_len: usize,
     // ids of the query words that matches.
     ids: Vec<WordId>,
-    // position of the word in the whole text.
-    word_position: usize,
-    // position of the token in the whole text.
-    token_position: usize,
+    position: MatchPosition,
 }
 
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
@@ -130,13 +143,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         /// compute_partial_match peek into next words to validate if the match is complete.
         fn compute_partial_match<'a>(
             mut partial: PartialMatch<'a>,
-            token_position: usize,
-            word_position: usize,
+            first_token_position: usize,
+            first_word_position: usize,
             first_word_char_start: &usize,
             words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
             matches: &mut Vec<Match>,
         ) -> bool {
-            for (_, _, word) in words_positions {
+            for (token_position, word_position, word) in words_positions {
                 partial = match partial.match_token(word) {
                     // token matches the partial match, but the match is not full,
                     // we temporarily save the current token then we try to match the next one.
@@ -145,10 +158,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
-                            match_len: word.char_end - first_word_char_start,
+                            match_len: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
-                            word_position,
-                            token_position,
+                            position: MatchPosition::Phrase {
+                                word_positions: (first_word_position, word_position),
+                                token_positions: (first_token_position, token_position),
+                            },
                         });
 
                         // the match is complete, we return true.
@@ -191,8 +206,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                         matches.push(Match {
                             match_len: char_len,
                             ids,
-                            word_position,
-                            token_position,
+                            position: MatchPosition::Word { word_position, token_position },
                         });
                         break;
                     }
@@ -228,13 +242,47 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[m.token_position].byte_start,
+                    start: tokens[match m.position {
+                        MatchPosition::Word { token_position, .. } => token_position,
+                        MatchPosition::Phrase {
+                            token_positions: (first_token_position, _),
+                            ..
+                        } => first_token_position,
+                    }]
+                    .byte_start,
                     length: m.match_len,
                 })
                 .collect(),
         }
     }
 
+    // @TODO: This should be improved, looks nasty
+    fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize {
+        match m.position {
+            MatchPosition::Word { word_position, token_position } => {
+                if is_word {
+                    word_position
+                } else {
+                    token_position
+                }
+            }
+            MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => {
+                if is_word {
+                    if is_first {
+                        return wpf;
+                    } else {
+                        return wpl;
+                    }
+                }
+                if is_first {
+                    tpf
+                } else {
+                    tpl
+                }
+            }
+        }
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(
         &self,
@@ -243,10 +291,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         crop_size: usize,
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
-        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
+        let first_match_word_position =
+            matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0);
+        let first_match_token_position =
+            matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0);
+        let last_match_word_position =
+            matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0);
+        let last_match_token_position =
+            matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0);
 
         // matches needs to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
@@ -350,7 +402,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 }
 
                 // compute distance between matches
-                distance_score -= (next_match.word_position - m.word_position).min(7) as i16;
+                distance_score -= (self.get_match_pos(next_match, true, true)
+                    - self.get_match_pos(m, true, true))
+                .min(7) as i16;
             }
 
             ids.extend(m.ids.iter());
@@ -378,7 +432,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                if next_match.word_position - matches[interval_first].word_position >= crop_size {
+                let next_match_word_position = self.get_match_pos(next_match, true, true);
+
+                if next_match_word_position
+                    - self.get_match_pos(&matches[interval_first], false, true)
+                    >= crop_size
+                {
                     let interval_score =
                         self.match_interval_score(&matches[interval_first..=interval_last]);
 
@@ -389,10 +448,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     }
 
                     // advance start of the interval while interval is longer than crop_size.
-                    while next_match.word_position - matches[interval_first].word_position
-                        >= crop_size
-                    {
+                    loop {
                         interval_first += 1;
+
+                        if next_match_word_position
+                            - self.get_match_pos(&matches[interval_first], false, true)
+                            < crop_size
+                        {
+                            break;
+                        }
                     }
                 }
                 interval_last = index;
@@ -441,33 +505,41 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     if format_options.highlight {
                         // insert highlight markers around matches.
                         for m in matches {
-                            let token = &tokens[m.token_position];
+                            let (current_byte_start, current_byte_end) = match m.position {
+                                MatchPosition::Word { token_position, .. } => {
+                                    let token = &tokens[token_position];
+                                    (&token.byte_start, &token.byte_end)
+                                }
+                                MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => {
+                                    (&tokens[ftp].byte_start, &tokens[ltp].byte_end)
+                                }
+                            };
 
                             // skip matches out of the crop window.
-                            if token.byte_start < byte_start || token.byte_end > byte_end {
+                            if *current_byte_start < byte_start || *current_byte_end > byte_end {
                                 continue;
                             }
 
-                            if byte_index < token.byte_start {
-                                formatted.push(&self.text[byte_index..token.byte_start]);
+                            if byte_index < *current_byte_start {
+                                formatted.push(&self.text[byte_index..*current_byte_start]);
                             }
 
-                            let highlight_byte_index = self.text[token.byte_start..]
+                            let highlight_byte_index = self.text[*current_byte_start..]
                                 .char_indices()
                                 .enumerate()
                                 .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
+                                .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
 
                             formatted.push(self.highlight_prefix);
-                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
+                            formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
                             formatted.push(self.highlight_suffix);
 
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                            if highlight_byte_index < token.byte_end {
-                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            if highlight_byte_index < *current_byte_end {
+                                formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
                             }
 
-                            byte_index = token.byte_start + m.match_len;
+                            byte_index = *current_byte_end;
                         }
                     }
 

From cc6a2aec06ebd6cb7332afb0478affe3e63185af Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:31:07 +0300
Subject: [PATCH 03/21] Improve changes to Matcher

---
 milli/src/search/new/matches/mod.rs | 78 +++++++++++++++--------------
 1 file changed, 41 insertions(+), 37 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 26dd6f6e8..a84b25923 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -93,6 +93,16 @@ impl FormatOptions {
     }
 }
 
+enum FL {
+    First,
+    Last,
+}
+
+enum WT {
+    Word,
+    Token,
+}
+
 #[derive(Clone, Debug)]
 pub enum MatchPosition {
     Word {
@@ -256,28 +266,22 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
-    // @TODO: This should be improved, looks nasty
-    fn get_match_pos(&self, m: &Match, is_first: bool, is_word: bool) -> usize {
+    fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize {
         match m.position {
-            MatchPosition::Word { word_position, token_position } => {
-                if is_word {
-                    word_position
-                } else {
-                    token_position
-                }
-            }
-            MatchPosition::Phrase { word_positions: (wpf, wpl), token_positions: (tpf, tpl) } => {
-                if is_word {
-                    if is_first {
-                        return wpf;
-                    } else {
-                        return wpl;
-                    }
-                }
-                if is_first {
-                    tpf
-                } else {
-                    tpl
+            MatchPosition::Word { word_position, token_position } => match wt {
+                WT::Word => word_position,
+                WT::Token => token_position,
+            },
+            MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => {
+                match wt {
+                    WT::Word => match fl {
+                        FL::First => fwp,
+                        FL::Last => lwp,
+                    },
+                    WT::Token => match fl {
+                        FL::First => ftp,
+                        FL::Last => ltp,
+                    },
                 }
             }
         }
@@ -292,13 +296,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
         let first_match_word_position =
-            matches.first().map(|m| self.get_match_pos(m, true, true)).unwrap_or(0);
+            matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0);
         let first_match_token_position =
-            matches.first().map(|m| self.get_match_pos(m, true, false)).unwrap_or(0);
+            matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0);
         let last_match_word_position =
-            matches.last().map(|m| self.get_match_pos(m, false, true)).unwrap_or(0);
+            matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0);
         let last_match_token_position =
-            matches.last().map(|m| self.get_match_pos(m, false, false)).unwrap_or(0);
+            matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0);
 
         // matches needs to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
@@ -401,10 +405,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     order_score += 1;
                 }
 
+                let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First);
+                let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First);
+
                 // compute distance between matches
-                distance_score -= (self.get_match_pos(next_match, true, true)
-                    - self.get_match_pos(m, true, true))
-                .min(7) as i16;
+                distance_score -=
+                    (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16;
             }
 
             ids.extend(m.ids.iter());
@@ -432,12 +438,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                let next_match_word_position = self.get_match_pos(next_match, true, true);
+                let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First);
+                let mut interval_first_match_word_pos =
+                    self.get_match_pos(&matches[interval_first], WT::Word, FL::Last);
 
-                if next_match_word_position
-                    - self.get_match_pos(&matches[interval_first], false, true)
-                    >= crop_size
-                {
+                if next_match_word_pos - interval_first_match_word_pos >= crop_size {
                     let interval_score =
                         self.match_interval_score(&matches[interval_first..=interval_last]);
 
@@ -450,11 +455,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     // advance start of the interval while interval is longer than crop_size.
                     loop {
                         interval_first += 1;
+                        interval_first_match_word_pos =
+                            self.get_match_pos(&matches[interval_first], WT::Word, FL::Last);
 
-                        if next_match_word_position
-                            - self.get_match_pos(&matches[interval_first], false, true)
-                            < crop_size
-                        {
+                        if next_match_word_pos - interval_first_match_word_pos < crop_size {
                             break;
                         }
                     }

From 65e3d61a955dd9b0f4b877d17a0b2b0dc087816c Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 13 Sep 2024 13:35:58 +0300
Subject: [PATCH 04/21] Make use of helper function in one more place

---
 milli/src/search/new/matches/mod.rs | 35 ++++++++++++-----------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index a84b25923..5a4f0b914 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -245,27 +245,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         self
     }
 
-    /// Returns boundaries of the words that match the query.
-    pub fn matches(&mut self) -> Vec<MatchBounds> {
-        match &self.matches {
-            None => self.compute_matches().matches(),
-            Some((tokens, matches)) => matches
-                .iter()
-                .map(|m| MatchBounds {
-                    start: tokens[match m.position {
-                        MatchPosition::Word { token_position, .. } => token_position,
-                        MatchPosition::Phrase {
-                            token_positions: (first_token_position, _),
-                            ..
-                        } => first_token_position,
-                    }]
-                    .byte_start,
-                    length: m.match_len,
-                })
-                .collect(),
-        }
-    }
-
     fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize {
         match m.position {
             MatchPosition::Word { word_position, token_position } => match wt {
@@ -287,6 +266,20 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         }
     }
 
+    /// Returns boundaries of the words that match the query.
+    pub fn matches(&mut self) -> Vec<MatchBounds> {
+        match &self.matches {
+            None => self.compute_matches().matches(),
+            Some((tokens, matches)) => matches
+                .iter()
+                .map(|m| MatchBounds {
+                    start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start,
+                    length: m.match_len,
+                })
+                .collect(),
+        }
+    }
+
     /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(
         &self,

From cab63abc845d87350ab36c07d3999b58eebd0eaa Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 13 Sep 2024 14:35:28 +0300
Subject: [PATCH 05/21] Improve MatchesPosition enum with an impl

---
 milli/src/search/new/matches/mod.rs | 81 ++++++++++++++---------------
 1 file changed, 40 insertions(+), 41 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 5a4f0b914..ce878a1eb 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -93,16 +93,6 @@ impl FormatOptions {
     }
 }
 
-enum FL {
-    First,
-    Last,
-}
-
-enum WT {
-    Word,
-    Token,
-}
-
 #[derive(Clone, Debug)]
 pub enum MatchPosition {
     Word {
@@ -127,6 +117,36 @@ pub struct Match {
     position: MatchPosition,
 }
 
+impl MatchPosition {
+    fn get_first_word(m: &Match) -> usize {
+        match m.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
+        }
+    }
+
+    fn get_last_word(m: &Match) -> usize {
+        match m.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp,
+        }
+    }
+
+    fn get_first_token(m: &Match) -> usize {
+        match m.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp,
+        }
+    }
+
+    fn get_last_token(m: &Match) -> usize {
+        match m.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
+        }
+    }
+}
+
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
 pub struct MatchBounds {
     pub start: usize,
@@ -245,27 +265,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         self
     }
 
-    fn get_match_pos(&self, m: &Match, wt: WT, fl: FL) -> usize {
-        match m.position {
-            MatchPosition::Word { word_position, token_position } => match wt {
-                WT::Word => word_position,
-                WT::Token => token_position,
-            },
-            MatchPosition::Phrase { word_positions: (fwp, lwp), token_positions: (ftp, ltp) } => {
-                match wt {
-                    WT::Word => match fl {
-                        FL::First => fwp,
-                        FL::Last => lwp,
-                    },
-                    WT::Token => match fl {
-                        FL::First => ftp,
-                        FL::Last => ltp,
-                    },
-                }
-            }
-        }
-    }
-
     /// Returns boundaries of the words that match the query.
     pub fn matches(&mut self) -> Vec<MatchBounds> {
         match &self.matches {
@@ -273,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[self.get_match_pos(m, WT::Token, FL::First)].byte_start,
+                    start: tokens[MatchPosition::get_first_token(m)].byte_start,
                     length: m.match_len,
                 })
                 .collect(),
@@ -289,13 +288,13 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
         let first_match_word_position =
-            matches.first().map(|m| self.get_match_pos(m, WT::Word, FL::First)).unwrap_or(0);
+            matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0);
         let first_match_token_position =
-            matches.first().map(|m| self.get_match_pos(m, WT::Token, FL::First)).unwrap_or(0);
+            matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0);
         let last_match_word_position =
-            matches.last().map(|m| self.get_match_pos(m, WT::Word, FL::Last)).unwrap_or(0);
+            matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0);
         let last_match_token_position =
-            matches.last().map(|m| self.get_match_pos(m, WT::Token, FL::Last)).unwrap_or(0);
+            matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0);
 
         // matches needs to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
@@ -398,8 +397,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     order_score += 1;
                 }
 
-                let next_match_first_word_pos = self.get_match_pos(next_match, WT::Word, FL::First);
-                let current_match_first_word_pos = self.get_match_pos(m, WT::Word, FL::First);
+                let next_match_first_word_pos = MatchPosition::get_first_word(next_match);
+                let current_match_first_word_pos = MatchPosition::get_first_word(m);
 
                 // compute distance between matches
                 distance_score -=
@@ -431,9 +430,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                let next_match_word_pos = self.get_match_pos(next_match, WT::Word, FL::First);
+                let next_match_word_pos = MatchPosition::get_first_word(next_match);
                 let mut interval_first_match_word_pos =
-                    self.get_match_pos(&matches[interval_first], WT::Word, FL::Last);
+                    MatchPosition::get_last_word(&matches[interval_first]);
 
                 if next_match_word_pos - interval_first_match_word_pos >= crop_size {
                     let interval_score =
@@ -449,7 +448,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     loop {
                         interval_first += 1;
                         interval_first_match_word_pos =
-                            self.get_match_pos(&matches[interval_first], WT::Word, FL::Last);
+                            MatchPosition::get_last_word(&matches[interval_first]);
 
                         if next_match_word_pos - interval_first_match_word_pos < crop_size {
                             break;

From a2a16bf846066f422a5e6bd9bcb0009a894dcad0 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 13 Sep 2024 21:20:06 +0300
Subject: [PATCH 06/21] Move MatchPosition impl to Match, adjust counting score
 for phrases

---
 milli/src/search/new/matches/mod.rs | 66 +++++++++++++++++++----------
 1 file changed, 43 insertions(+), 23 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index ce878a1eb..e63920145 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -117,30 +117,30 @@ pub struct Match {
     position: MatchPosition,
 }
 
-impl MatchPosition {
-    fn get_first_word(m: &Match) -> usize {
-        match m.position {
+impl Match {
+    fn get_first_word_pos(&self) -> usize {
+        match self.position {
             MatchPosition::Word { word_position, .. } => word_position,
             MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
         }
     }
 
-    fn get_last_word(m: &Match) -> usize {
-        match m.position {
+    fn get_last_word_pos(&self) -> usize {
+        match self.position {
             MatchPosition::Word { word_position, .. } => word_position,
             MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp,
         }
     }
 
-    fn get_first_token(m: &Match) -> usize {
-        match m.position {
+    fn get_first_token_pos(&self) -> usize {
+        match self.position {
             MatchPosition::Word { token_position, .. } => token_position,
             MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp,
         }
     }
 
-    fn get_last_token(m: &Match) -> usize {
-        match m.position {
+    fn get_last_token_pos(&self) -> usize {
+        match self.position {
             MatchPosition::Word { token_position, .. } => token_position,
             MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
         }
@@ -272,7 +272,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             Some((tokens, matches)) => matches
                 .iter()
                 .map(|m| MatchBounds {
-                    start: tokens[MatchPosition::get_first_token(m)].byte_start,
+                    start: tokens[m.get_first_token_pos()].byte_start,
                     length: m.match_len,
                 })
                 .collect(),
@@ -288,13 +288,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
         let first_match_word_position =
-            matches.first().map(|m| MatchPosition::get_first_word(m)).unwrap_or(0);
+            matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0);
         let first_match_token_position =
-            matches.first().map(|m| MatchPosition::get_first_token(m)).unwrap_or(0);
-        let last_match_word_position =
-            matches.last().map(|m| MatchPosition::get_last_word(m)).unwrap_or(0);
-        let last_match_token_position =
-            matches.last().map(|m| MatchPosition::get_last_token(m)).unwrap_or(0);
+            matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0);
+        let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
+        let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
 
         // matches needs to be counted in the crop len.
         let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
@@ -389,6 +387,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let mut order_score = 0;
         let mut distance_score = 0;
 
+        // Count score for phrases
+        let tally_phrase_scores =
+            |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| {
+                let words_in_phrase_minus_one = (lwp - fwp) as i16;
+                // will always be ordered, so +1 for each space between words
+                *order_score += words_in_phrase_minus_one;
+                // distance will always be 1, so -1 for each space between words
+                *distance_score -= words_in_phrase_minus_one;
+            };
+
         let mut iter = matches.iter().peekable();
         while let Some(m) = iter.next() {
             if let Some(next_match) = iter.peek() {
@@ -397,12 +405,24 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     order_score += 1;
                 }
 
-                let next_match_first_word_pos = MatchPosition::get_first_word(next_match);
-                let current_match_first_word_pos = MatchPosition::get_first_word(m);
+                let m_last_word_pos = match m.position {
+                    MatchPosition::Word { word_position, .. } => word_position,
+                    MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => {
+                        tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+                        lwp
+                    }
+                };
+
+                let next_match_first_word_pos = match next_match.position {
+                    MatchPosition::Word { word_position, .. } => word_position,
+                    MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
+                };
 
                 // compute distance between matches
-                distance_score -=
-                    (next_match_first_word_pos - current_match_first_word_pos).min(7) as i16;
+                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
+            } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position {
+                // in case last match is a phrase, count score for its words
+                tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
             }
 
             ids.extend(m.ids.iter());
@@ -430,9 +450,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                let next_match_word_pos = MatchPosition::get_first_word(next_match);
+                let next_match_word_pos = next_match.get_last_word_pos();
                 let mut interval_first_match_word_pos =
-                    MatchPosition::get_last_word(&matches[interval_first]);
+                    matches[interval_first].get_first_word_pos();
 
                 if next_match_word_pos - interval_first_match_word_pos >= crop_size {
                     let interval_score =
@@ -448,7 +468,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     loop {
                         interval_first += 1;
                         interval_first_match_word_pos =
-                            MatchPosition::get_last_word(&matches[interval_first]);
+                            matches[interval_first].get_first_word_pos();
 
                         if next_match_word_pos - interval_first_match_word_pos < crop_size {
                             break;

From 51085206ccab6e8e0098c4cf8b2a3e67e06558a4 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Sat, 14 Sep 2024 10:14:07 +0300
Subject: [PATCH 07/21] Misc adjustments

---
 milli/src/search/new/matches/mod.rs | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index e63920145..414509cd3 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -387,7 +387,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let mut order_score = 0;
         let mut distance_score = 0;
 
-        // Count score for phrases
+        // count score for phrases
         let tally_phrase_scores =
             |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| {
                 let words_in_phrase_minus_one = (lwp - fwp) as i16;
@@ -450,11 +450,11 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
-                let next_match_word_pos = next_match.get_last_word_pos();
-                let mut interval_first_match_word_pos =
+                let next_match_last_word_pos = next_match.get_last_word_pos();
+                let mut interval_first_match_first_word_pos =
                     matches[interval_first].get_first_word_pos();
 
-                if next_match_word_pos - interval_first_match_word_pos >= crop_size {
+                if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
                     let interval_score =
                         self.match_interval_score(&matches[interval_first..=interval_last]);
 
@@ -467,10 +467,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     // advance start of the interval while interval is longer than crop_size.
                     loop {
                         interval_first += 1;
-                        interval_first_match_word_pos =
+                        interval_first_match_first_word_pos =
                             matches[interval_first].get_first_word_pos();
 
-                        if next_match_word_pos - interval_first_match_word_pos < crop_size {
+                        if next_match_last_word_pos - interval_first_match_first_word_pos
+                            < crop_size
+                        {
                             break;
                         }
                     }

From 993408d3ba65cbcea9920caeab8b421160a931ac Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Sun, 15 Sep 2024 16:15:09 +0300
Subject: [PATCH 08/21] Change closure to fn

---
 milli/src/search/new/matches/mod.rs | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 414509cd3..df110aff9 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -388,14 +388,18 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let mut distance_score = 0;
 
         // count score for phrases
-        let tally_phrase_scores =
-            |fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16| {
-                let words_in_phrase_minus_one = (lwp - fwp) as i16;
-                // will always be ordered, so +1 for each space between words
-                *order_score += words_in_phrase_minus_one;
-                // distance will always be 1, so -1 for each space between words
-                *distance_score -= words_in_phrase_minus_one;
-            };
+        fn tally_phrase_scores(
+            fwp: &usize,
+            lwp: &usize,
+            order_score: &mut i16,
+            distance_score: &mut i16,
+        ) {
+            let words_in_phrase_minus_one = (lwp - fwp) as i16;
+            // will always be ordered, so +1 for each space between words
+            *order_score += words_in_phrase_minus_one;
+            // distance will always be 1, so -1 for each space between words
+            *distance_score -= words_in_phrase_minus_one;
+        }
 
         let mut iter = matches.iter().peekable();
         while let Some(m) = iter.next() {

From f7337affd6342ae495d99312862b300e7af461e0 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Tue, 17 Sep 2024 17:31:09 +0300
Subject: [PATCH 09/21] Adjust tests to changes

---
 meilisearch/tests/search/locales.rs | 44 ++++++++++++++---------------
 milli/src/search/new/matches/mod.rs |  2 +-
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/meilisearch/tests/search/locales.rs b/meilisearch/tests/search/locales.rs
index dbc4fcc30..b9e70c5b1 100644
--- a/meilisearch/tests/search/locales.rs
+++ b/meilisearch/tests/search/locales.rs
@@ -400,9 +400,9 @@ async fn force_locales() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -447,9 +447,9 @@ async fn force_locales() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -524,9 +524,9 @@ async fn force_locales_with_pattern() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -571,9 +571,9 @@ async fn force_locales_with_pattern() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -689,8 +689,8 @@ async fn force_locales_with_pattern_nested() {
                           "author": "諫山 創"
                         },
                         "document_zh": {
-                          "name": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
-                          "description": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                          "name": "<em>进击的巨人</em>",
+                          "description": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                           "author": "諫山創"
                         },
                         "id": "852",
@@ -788,9 +788,9 @@ async fn force_different_locales_with_pattern() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -889,9 +889,9 @@ async fn auto_infer_locales_at_search_with_attributes_to_search_on() {
                             ]
                           },
                           "_formatted": {
-                            "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                            "name_zh": "<em>进击的巨人</em>",
                             "author_zh": "諫山創",
-                            "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                            "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                             "id": "853",
                             "_vectors": {
                               "manual": [
@@ -965,9 +965,9 @@ async fn auto_infer_locales_at_search() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -1011,9 +1011,9 @@ async fn auto_infer_locales_at_search() {
                             ]
                           },
                           "_formatted": {
-                            "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                            "name_zh": "<em>进击的巨人</em>",
                             "author_zh": "諫山創",
-                            "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                            "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                             "id": "853",
                             "_vectors": {
                               "manual": [
@@ -1057,9 +1057,9 @@ async fn auto_infer_locales_at_search() {
                         ]
                       },
                       "_formatted": {
-                        "name_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
+                        "name_zh": "<em>进击的巨人</em>",
                         "author_zh": "諫山創",
-                        "description_zh": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                        "description_zh": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                         "id": "853",
                         "_vectors": {
                           "manual": [
@@ -1177,8 +1177,8 @@ async fn force_different_locales_with_pattern_nested() {
                           "author": "諫山 創"
                         },
                         "document_zh": {
-                          "name": "<em>进</em><em>击</em><em>的</em><em>巨人</em>",
-                          "description": "<em>进</em><em>击</em><em>的</em><em>巨人</em>是日本的漫画系列，由諫山 創作画。",
+                          "name": "<em>进击的巨人</em>",
+                          "description": "<em>进击的巨人</em>是日本的漫画系列，由諫山 創作画。",
                           "author": "諫山創"
                         },
                         "id": "852",
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index df110aff9..09d3db575 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -919,7 +919,7 @@ mod tests {
         // should return 10 words with a marker at the start as well the end, and the highlighted matches.
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…had the power to split <em>the world</em> between those who…"
+            @"…the power to split <em>the world</em> between those who embraced…"
         );
 
         let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");

From 83113998f99bb6d59bb9e94e9ef3e527f4c93f62 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Wed, 18 Sep 2024 10:35:23 +0300
Subject: [PATCH 10/21] Add more test assertions

---
 milli/src/search/new/matches/mod.rs | 36 +++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 09d3db575..8a84f91bd 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -929,6 +929,42 @@ mod tests {
             matcher.format(format_options),
             @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
         );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world\"",
+        );
+        let mut matcher = builder.build(text, None);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world between\"",
+        );
+        let mut matcher = builder.build(text, None);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"The groundbreaking invention had the power to split the world …"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"",
+        );
+        let mut matcher = builder.build(text, None);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
+        );
     }
 
     #[test]

From 0ffeea5a5209f1e206720e3cf63d7fe627b8cee0 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Thu, 19 Sep 2024 09:06:40 +0300
Subject: [PATCH 11/21] Remove wrong comments

---
 milli/src/search/new/matches/mod.rs | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 8a84f91bd..26115c39b 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -936,7 +936,6 @@ mod tests {
             "\"The groundbreaking invention had the power to split the world\"",
         );
         let mut matcher = builder.build(text, None);
-        // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
             @"<em>The groundbreaking invention had the power to split the world</em>…"
@@ -948,7 +947,6 @@ mod tests {
             "\"The groundbreaking invention had the power to split the world between\"",
         );
         let mut matcher = builder.build(text, None);
-        // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
             @"The groundbreaking invention had the power to split the world …"
@@ -960,7 +958,6 @@ mod tests {
             "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"",
         );
         let mut matcher = builder.build(text, None);
-        // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
             @"…between those who <em>embraced progress and those who resisted change</em>…"

From d20a39b9599f7962b2a316e45cc126f90a3d8eed Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 27 Sep 2024 15:44:30 +0300
Subject: [PATCH 12/21] Refactor find_best_match_interval

---
 milli/src/search/new/matches/mod.rs | 154 +++++++++++++++++++---------
 1 file changed, 106 insertions(+), 48 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 26115c39b..bbd39e682 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -442,36 +442,48 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
 
     /// Returns the matches interval where the score computed by match_interval_score is the best.
     fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
+        let matches_len = matches.len();
+
         // we compute the matches interval if we have at least 2 matches.
-        if matches.len() > 1 {
+        if matches_len > 1 {
+            // current interval positions.
+            let mut interval_first = 0;
             // positions of the first and the last match of the best matches interval in `matches`.
             let mut best_interval = (0, 0);
             let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
-            // current interval positions.
-            let mut interval_first = 0;
-            let mut interval_last = 0;
-            for (index, next_match) in matches.iter().enumerate().skip(1) {
+
+            let mut index = 1;
+            while index < matches_len - 1 {
+                let next_match = &matches[index];
+
                 // if next match would make interval gross more than crop_size,
                 // we compare the current interval with the best one,
                 // then we increase `interval_first` until next match can be added.
                 let next_match_last_word_pos = next_match.get_last_word_pos();
-                let mut interval_first_match_first_word_pos =
+                let interval_first_match_first_word_pos =
                     matches[interval_first].get_first_word_pos();
 
+                // if the next match would mean that we pass the crop size window,
+                // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
+                // and calculate a score for it, and check if it's better than our best so far
                 if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                    let interval_score =
-                        self.match_interval_score(&matches[interval_first..=interval_last]);
+                    // skip for 1, because it would result in the same as our very first interval score
+                    if index != 1 {
+                        let interval_last = index - 1;
+                        let interval_score =
+                            self.match_interval_score(&matches[interval_first..=interval_last]);
 
-                    // keep interval if it's the best
-                    if interval_score > best_interval_score {
-                        best_interval = (interval_first, interval_last);
-                        best_interval_score = interval_score;
+                        // keep interval if it's the best
+                        if interval_score > best_interval_score {
+                            best_interval = (interval_first, interval_last);
+                            best_interval_score = interval_score;
+                        }
                     }
 
                     // advance start of the interval while interval is longer than crop_size.
                     loop {
                         interval_first += 1;
-                        interval_first_match_first_word_pos =
+                        let interval_first_match_first_word_pos =
                             matches[interval_first].get_first_word_pos();
 
                         if next_match_last_word_pos - interval_first_match_first_word_pos
@@ -481,10 +493,12 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                         }
                     }
                 }
-                interval_last = index;
+
+                index += 1;
             }
 
             // compute the last interval score and compare it to the best one.
+            let interval_last = matches_len - 1;
             let interval_score =
                 self.match_interval_score(&matches[interval_first..=interval_last]);
             if interval_score > best_interval_score {
@@ -914,32 +928,32 @@ mod tests {
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
 
-        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
-        let mut matcher = builder.build(text, None);
-        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
-        insta::assert_snapshot!(
-            matcher.format(format_options),
-            @"…the power to split <em>the world</em> between those who embraced…"
-        );
+        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        // let mut matcher = builder.build(text, None);
+        // // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…the power to split <em>the world</em> between those who embraced…"
+        // );
 
-        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
-        let mut matcher = builder.build(text, None);
-        // should highlight "those" and the phrase "and those".
-        insta::assert_snapshot!(
-            matcher.format(format_options),
-            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
-        );
+        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
+        // let mut matcher = builder.build(text, None);
+        // // should highlight "those" and the phrase "and those".
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…groundbreaking invention had the <em>power to</em> split the world between…"
+        // );
 
-        let builder = MatcherBuilder::new_test(
-            &rtxn,
-            &temp_index,
-            "\"The groundbreaking invention had the power to split the world\"",
-        );
-        let mut matcher = builder.build(text, None);
-        insta::assert_snapshot!(
-            matcher.format(format_options),
-            @"<em>The groundbreaking invention had the power to split the world</em>…"
-        );
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"The groundbreaking invention had the power to split the world\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"<em>The groundbreaking invention had the power to split the world</em>…"
+        // );
 
         let builder = MatcherBuilder::new_test(
             &rtxn,
@@ -952,16 +966,60 @@ mod tests {
             @"The groundbreaking invention had the power to split the world …"
         );
 
-        let builder = MatcherBuilder::new_test(
-            &rtxn,
-            &temp_index,
-            "\"The groundbreaking invention\" \"embraced progress and those who resisted change\"",
-        );
-        let mut matcher = builder.build(text, None);
-        insta::assert_snapshot!(
-            matcher.format(format_options),
-            @"…between those who <em>embraced progress and those who resisted change</em>…"
-        );
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…between those who <em>embraced progress and those who resisted change</em>…"
+        // );
+
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"The groundbreaking invention\" \"split the world between those\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…the power to <em>split the world between those</em> who embraced…"
+        // );
+
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"groundbreaking invention\" \"split the world between\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+        // );
+
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"groundbreaking invention\" \"had the power to split the world between those\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…invention <em>had the power to split the world between those</em>…"
+        // );
+
+        // let builder = MatcherBuilder::new_test(
+        //     &rtxn,
+        //     &temp_index,
+        //     "\"The groundbreaking invention\" \"had the power to split the world between those\"",
+        // );
+        // let mut matcher = builder.build(text, None);
+        // insta::assert_snapshot!(
+        //     matcher.format(format_options),
+        //     @"…invention <em>had the power to split the world between those</em>…"
+        // );
     }
 
     #[test]

From eabc14c26858d9f0bda89e6fa38f0aa4b0244be8 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Mon, 30 Sep 2024 21:24:41 +0300
Subject: [PATCH 13/21] Refactor, handle more cases for phrases

---
 .../src/search/new/matches/matching_words.rs  |   2 +-
 milli/src/search/new/matches/mod.rs           | 497 ++++++++++--------
 2 files changed, 291 insertions(+), 208 deletions(-)

diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index 4ad5c37ec..4deaff6a0 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -181,7 +181,7 @@ impl<'a> PartialMatch<'a> {
         // return a new Partial match allowing the highlighter to continue.
         if is_matching && matching_words.len() > 1 {
             matching_words.remove(0);
-            Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids, char_len }))
         // if there is no remaining word to match in the phrase and the current token is matching,
         // return a Full match.
         } else if is_matching {
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index bbd39e682..624287f5f 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -1,6 +1,6 @@
 use std::borrow::Cow;
 
-use charabia::{Language, SeparatorKind, Token, Tokenizer};
+use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
@@ -145,6 +145,13 @@ impl Match {
             MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
         }
     }
+
+    fn get_word_count(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { .. } => 1,
+            MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1,
+        }
+    }
 }
 
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
@@ -153,6 +160,27 @@ pub struct MatchBounds {
     pub length: usize,
 }
 
+enum SimpleTokenKind {
+    Separator(SeparatorKind),
+    NotSeparator,
+}
+
+impl SimpleTokenKind {
+    fn get(token: &&Token<'_>) -> Self {
+        match token.kind {
+            TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
+            _ => Self::NotSeparator,
+        }
+    }
+
+    fn is_not_separator(&self) -> bool {
+        match self {
+            SimpleTokenKind::NotSeparator => true,
+            SimpleTokenKind::Separator(_) => false,
+        }
+    }
+}
+
 /// Structure used to analyze a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
 pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
@@ -287,95 +315,130 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         crop_size: usize,
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
-        let first_match_word_position =
+        let first_match_first_word_position =
             matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0);
-        let first_match_token_position =
+        let first_match_first_token_position =
             matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0);
-        let last_match_word_position = matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
-        let last_match_token_position = matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
+        let last_match_last_word_position =
+            matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
+        let last_match_last_token_position =
+            matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
 
-        // matches needs to be counted in the crop len.
-        let mut remaining_words = crop_size + first_match_word_position - last_match_word_position;
+        let matches_window_len =
+            last_match_last_word_position - first_match_first_word_position + 1;
 
-        // create the initial state of the crop window: 2 iterators starting from the matches positions,
-        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-        let mut before_tokens = tokens[..first_match_token_position].iter().rev().peekable();
-        // an iterator starting from the last match token position and going towards the end of the text.
-        let mut after_tokens = tokens[last_match_token_position..].iter().peekable();
+        if crop_size >= matches_window_len {
+            // matches needs to be counted in the crop len.
+            let mut remaining_words = crop_size - matches_window_len;
 
-        // grows the crop window peeking in both directions
-        // until the window contains the good number of words:
-        while remaining_words > 0 {
-            let before_token = before_tokens.peek().map(|t| t.separator_kind());
-            let after_token = after_tokens.peek().map(|t| t.separator_kind());
+            // create the initial state of the crop window: 2 iterators starting from the matches positions,
+            // a reverse iterator starting from the first match token position and going towards the beginning of the text,
+            let mut before_tokens =
+                tokens[..first_match_first_token_position].iter().rev().peekable();
+            // an iterator starting from the last match token position and going towards the end of the text.
+            let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable();
 
-            match (before_token, after_token) {
-                // we can expand both sides.
-                (Some(before_token), Some(after_token)) => {
-                    match (before_token, after_token) {
-                        // if they are both separators and are the same kind then advance both,
-                        // or expand in the soft separator separator side.
-                        (Some(before_token_kind), Some(after_token_kind)) => {
-                            if before_token_kind == after_token_kind {
-                                before_tokens.next();
+            // grows the crop window peeking in both directions
+            // until the window contains the good number of words:
+            while remaining_words > 0 {
+                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get);
+                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get);
 
-                                // this avoid having an ending separator before crop marker.
-                                if remaining_words > 1 {
+                match (before_token_kind, after_token_kind) {
+                    // we can expand both sides.
+                    (Some(before_token_kind), Some(after_token_kind)) => {
+                        match (before_token_kind, after_token_kind) {
+                            // if they are both separators and are the same kind then advance both,
+                            // or expand in the soft separator separator side.
+                            (
+                                SimpleTokenKind::Separator(before_token_separator_kind),
+                                SimpleTokenKind::Separator(after_token_separator_kind),
+                            ) => {
+                                if before_token_separator_kind == after_token_separator_kind {
+                                    before_tokens.next();
+
+                                    // this avoid having an ending separator before crop marker.
+                                    if remaining_words > 1 {
+                                        after_tokens.next();
+                                    }
+                                } else if let SeparatorKind::Hard = before_token_separator_kind {
                                     after_tokens.next();
+                                } else {
+                                    before_tokens.next();
                                 }
-                            } else if before_token_kind == SeparatorKind::Hard {
-                                after_tokens.next();
-                            } else {
-                                before_tokens.next();
                             }
-                        }
-                        // if one of the tokens is a word, we expend in the side of the word.
-                        // left is a word, advance left.
-                        (None, Some(_)) => {
-                            before_tokens.next();
-                            remaining_words -= 1;
-                        }
-                        // right is a word, advance right.
-                        (Some(_), None) => {
-                            after_tokens.next();
-                            remaining_words -= 1;
-                        }
-                        // both are words, advance left then right if remaining_word > 0.
-                        (None, None) => {
-                            before_tokens.next();
-                            remaining_words -= 1;
-
-                            if remaining_words > 0 {
+                            // if one of the tokens is a word, we expend in the side of the word.
+                            // left is a word, advance left.
+                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
+                                before_tokens.next();
+                                remaining_words -= 1;
+                            }
+                            // right is a word, advance right.
+                            (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
                                 after_tokens.next();
                                 remaining_words -= 1;
                             }
+                            // both are words, advance left then right if remaining_word > 0.
+                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
+                                before_tokens.next();
+                                remaining_words -= 1;
+
+                                if remaining_words > 0 {
+                                    after_tokens.next();
+                                    remaining_words -= 1;
+                                }
+                            }
                         }
                     }
-                }
-                // the end of the text is reached, advance left.
-                (Some(before_token), None) => {
-                    before_tokens.next();
-                    if before_token.is_none() {
-                        remaining_words -= 1;
+                    // the end of the text is reached, advance left.
+                    (Some(before_token_kind), None) => {
+                        before_tokens.next();
+                        if let SimpleTokenKind::NotSeparator = before_token_kind {
+                            remaining_words -= 1;
+                        }
                     }
-                }
-                // the start of the text is reached, advance right.
-                (None, Some(after_token)) => {
-                    after_tokens.next();
-                    if after_token.is_none() {
-                        remaining_words -= 1;
+                    // the start of the text is reached, advance right.
+                    (None, Some(after_token_kind)) => {
+                        after_tokens.next();
+                        if let SimpleTokenKind::NotSeparator = after_token_kind {
+                            remaining_words -= 1;
+                        }
                     }
+                    // no more token to add.
+                    (None, None) => break,
                 }
-                // no more token to add.
-                (None, None) => break,
             }
+
+            // finally, keep the byte index of each bound of the crop window.
+            let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
+            let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
+
+            (crop_byte_start, crop_byte_end)
+        } else {
+            // there's one match? and it's longer than the crop window, so we have to advance inward
+            let mut remaining_extra_words = matches_window_len - crop_size;
+            let mut tokens_from_end =
+                tokens[..=last_match_last_token_position].iter().rev().peekable();
+
+            while remaining_extra_words > 0 {
+                let token_from_end_kind =
+                    tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO");
+                if token_from_end_kind.is_not_separator() {
+                    remaining_extra_words -= 1;
+                }
+
+                tokens_from_end.next();
+            }
+
+            let crop_byte_start = if first_match_first_token_position > 0 {
+                &tokens[first_match_first_token_position - 1].byte_end
+            } else {
+                &0
+            };
+            let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO");
+
+            (*crop_byte_start, crop_byte_end)
         }
-
-        // finally, keep the byte index of each bound of the crop window.
-        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
-        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
-
-        (crop_byte_start, crop_byte_end)
     }
 
     /// Compute the score of a match interval:
@@ -416,11 +479,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                         lwp
                     }
                 };
-
-                let next_match_first_word_pos = match next_match.position {
-                    MatchPosition::Word { word_position, .. } => word_position,
-                    MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
-                };
+                let next_match_first_word_pos = next_match.get_first_word_pos();
 
                 // compute distance between matches
                 distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
@@ -443,72 +502,96 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
     /// Returns the matches interval where the score computed by match_interval_score is the best.
     fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
         let matches_len = matches.len();
+        if matches_len <= 1 {
+            return matches;
+        }
+
+        // positions of the first and the last match of the best matches interval in `matches`.
+        struct BestInterval {
+            interval: (usize, usize),
+            score: (i16, i16, i16),
+        }
+
+        fn save_best_interval(
+            best_interval: &mut Option<BestInterval>,
+            interval_first: usize,
+            interval_last: usize,
+            interval_score: (i16, i16, i16),
+        ) {
+            if let Some(best_interval) = best_interval {
+                if interval_score > best_interval.score {
+                    best_interval.interval = (interval_first, interval_last);
+                    best_interval.score = interval_score;
+                }
+            } else {
+                *best_interval = Some(BestInterval {
+                    interval: (interval_first, interval_last),
+                    score: interval_score,
+                });
+            }
+        }
+
+        let mut best_interval: Option<BestInterval> = None;
 
         // we compute the matches interval if we have at least 2 matches.
-        if matches_len > 1 {
-            // current interval positions.
-            let mut interval_first = 0;
-            // positions of the first and the last match of the best matches interval in `matches`.
-            let mut best_interval = (0, 0);
-            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
+        // current interval positions.
+        let mut interval_first = 0;
+        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
 
-            let mut index = 1;
-            while index < matches_len - 1 {
-                let next_match = &matches[index];
+        for (index, next_match) in matches.iter().enumerate() {
+            // if next match would make interval gross more than crop_size,
+            // we compare the current interval with the best one,
+            // then we increase `interval_first` until next match can be added.
+            let next_match_last_word_pos = next_match.get_last_word_pos();
 
-                // if next match would make interval gross more than crop_size,
-                // we compare the current interval with the best one,
-                // then we increase `interval_first` until next match can be added.
-                let next_match_last_word_pos = next_match.get_last_word_pos();
-                let interval_first_match_first_word_pos =
-                    matches[interval_first].get_first_word_pos();
+            // if the next match would mean that we pass the crop size window,
+            // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
+            // and calculate a score for it, and check if it's better than our best so far
+            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+                // if index is 0 there is no last viable match
+                if index != 0 {
+                    let interval_last = index - 1;
+                    let interval_score =
+                        self.match_interval_score(&matches[interval_first..=interval_last]);
 
-                // if the next match would mean that we pass the crop size window,
-                // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
-                // and calculate a score for it, and check if it's better than our best so far
-                if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                    // skip for 1, because it would result in the same as our very first interval score
-                    if index != 1 {
-                        let interval_last = index - 1;
-                        let interval_score =
-                            self.match_interval_score(&matches[interval_first..=interval_last]);
-
-                        // keep interval if it's the best
-                        if interval_score > best_interval_score {
-                            best_interval = (interval_first, interval_last);
-                            best_interval_score = interval_score;
-                        }
-                    }
-
-                    // advance start of the interval while interval is longer than crop_size.
-                    loop {
-                        interval_first += 1;
-                        let interval_first_match_first_word_pos =
-                            matches[interval_first].get_first_word_pos();
-
-                        if next_match_last_word_pos - interval_first_match_first_word_pos
-                            < crop_size
-                        {
-                            break;
-                        }
-                    }
+                    // keep interval if it's the best
+                    save_best_interval(
+                        &mut best_interval,
+                        interval_first,
+                        interval_last,
+                        interval_score,
+                    );
                 }
 
-                index += 1;
-            }
+                // advance start of the interval while interval is longer than crop_size.
+                loop {
+                    interval_first += 1;
+                    interval_first_match_first_word_pos =
+                        matches[interval_first].get_first_word_pos();
 
-            // compute the last interval score and compare it to the best one.
-            let interval_last = matches_len - 1;
+                    if interval_first_match_first_word_pos > next_match_last_word_pos
+                        || next_match_last_word_pos - interval_first_match_first_word_pos
+                            < crop_size
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+
+        // compute the last interval score and compare it to the best one.
+        let interval_last = matches_len - 1;
+        // if it's the last match with itself, we need to make sure it's
+        // not a phrase longer than the crop window
+        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
             let interval_score =
                 self.match_interval_score(&matches[interval_first..=interval_last]);
-            if interval_score > best_interval_score {
-                best_interval = (interval_first, interval_last);
-            }
-
-            &matches[best_interval.0..=best_interval.1]
-        } else {
-            matches
+            save_best_interval(&mut best_interval, interval_first, interval_last, interval_score);
         }
+
+        // if none of the matches fit the criteria above, default to the first one
+        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
+        &matches[best_interval.0..=best_interval.1]
     }
 
     // Returns the formatted version of the original text.
@@ -928,98 +1011,98 @@ mod tests {
 
         let format_options = FormatOptions { highlight: true, crop: Some(10) };
 
-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
-        // let mut matcher = builder.build(text, None);
-        // // should return 10 words with a marker at the start as well the end, and the highlighted matches.
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…the power to split <em>the world</em> between those who embraced…"
-        // );
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
+        let mut matcher = builder.build(text, None);
+        // should return 10 words with a marker at the start as well the end, and the highlighted matches.
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…the power to split <em>the world</em> between those who embraced…"
+        );
 
-        // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
-        // let mut matcher = builder.build(text, None);
-        // // should highlight "those" and the phrase "and those".
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…groundbreaking invention had the <em>power to</em> split the world between…"
-        // );
-
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention had the power to split the world\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"<em>The groundbreaking invention had the power to split the world</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
+        let mut matcher = builder.build(text, None);
+        // should highlight "those" and the phrase "and those".
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…groundbreaking invention had the <em>power to</em> split the world between…"
+        );
 
         let builder = MatcherBuilder::new_test(
             &rtxn,
             &temp_index,
-            "\"The groundbreaking invention had the power to split the world between\"",
+            "\"The groundbreaking invention had the power to split the world\"",
         );
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"The groundbreaking invention had the power to split the world …"
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
         );
 
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…between those who <em>embraced progress and those who resisted change</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"The groundbreaking invention had the power to split the world…"
+        );
 
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…the power to <em>split the world between those</em> who embraced…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…between those who <em>embraced progress and those who resisted change</em>…"
+        );
 
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"groundbreaking invention\" \"split the world between\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…the power to <em>split the world between those</em> who embraced…"
+        );
 
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"groundbreaking invention\" \"had the power to split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"split the world between\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"
+        );
 
-        // let builder = MatcherBuilder::new_test(
-        //     &rtxn,
-        //     &temp_index,
-        //     "\"The groundbreaking invention\" \"had the power to split the world between those\"",
-        // );
-        // let mut matcher = builder.build(text, None);
-        // insta::assert_snapshot!(
-        //     matcher.format(format_options),
-        //     @"…invention <em>had the power to split the world between those</em>…"
-        // );
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…invention <em>had the power to split the world between those</em>…"
+        );
+
+        let builder = MatcherBuilder::new_test(
+            &rtxn,
+            &temp_index,
+            "\"The groundbreaking invention\" \"had the power to split the world between those\"",
+        );
+        let mut matcher = builder.build(text, None);
+        insta::assert_snapshot!(
+            matcher.format(format_options),
+            @"…invention <em>had the power to split the world between those</em>…"
+        );
     }
 
     #[test]

From 6d16230f17eb000407adb21dc2f3e9fa49767cc8 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Tue, 1 Oct 2024 17:19:15 +0300
Subject: [PATCH 14/21] Refactor

---
 milli/src/search/new/matches/mod.rs | 327 ++++++++++++++--------------
 1 file changed, 158 insertions(+), 169 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 624287f5f..804b59553 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -181,6 +181,149 @@ impl SimpleTokenKind {
     }
 }
 
+#[derive(PartialEq, PartialOrd)]
+struct MatchIntervalScore(i16, i16, i16);
+
+impl MatchIntervalScore {
+    /// Compute the score of a match interval:
+    /// 1) count unique matches
+    /// 2) calculate distance between matches
+    /// 3) count ordered matches
+    fn new(matches: &[Match]) -> Self {
+        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
+        let mut order_score = 0;
+        let mut distance_score = 0;
+
+        // count score for phrases
+        fn tally_phrase_scores(
+            fwp: &usize,
+            lwp: &usize,
+            order_score: &mut i16,
+            distance_score: &mut i16,
+        ) {
+            let words_in_phrase_minus_one = (lwp - fwp) as i16;
+            // will always be ordered, so +1 for each space between words
+            *order_score += words_in_phrase_minus_one;
+            // distance will always be 1, so -1 for each space between words
+            *distance_score -= words_in_phrase_minus_one;
+        }
+
+        let mut iter = matches.iter().peekable();
+        while let Some(m) = iter.next() {
+            if let Some(next_match) = iter.peek() {
+                // if matches are ordered
+                if next_match.ids.iter().min() > m.ids.iter().min() {
+                    order_score += 1;
+                }
+
+                let m_last_word_pos = match m.position {
+                    MatchPosition::Word { word_position, .. } => word_position,
+                    MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => {
+                        tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+                        lwp
+                    }
+                };
+                let next_match_first_word_pos = next_match.get_first_word_pos();
+
+                // compute distance between matches
+                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
+            } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position {
+                // in case last match is a phrase, count score for its words
+                tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+            }
+
+            ids.extend(m.ids.iter());
+        }
+
+        ids.sort_unstable();
+        ids.dedup();
+        let uniq_score = ids.len() as i16;
+
+        // rank by unique match count, then by distance between matches, then by ordered match count.
+        Self(uniq_score, distance_score, order_score)
+    }
+}
+
+struct MatchIntervalWithScore {
+    interval: (usize, usize),
+    score: MatchIntervalScore,
+}
+
+impl MatchIntervalWithScore {
+    /// Returns the matches interval where the score computed by match_interval_score is the best.
+    fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] {
+        let matches_len = matches.len();
+        if matches_len <= 1 {
+            return matches;
+        }
+
+        // positions of the first and the last match of the best matches interval in `matches`.
+        let mut best_interval: Option<Self> = None;
+        let mut save_best_interval = |interval_first, interval_last, interval_score| {
+            let is_interval_score_better =
+                &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score);
+            if *is_interval_score_better {
+                best_interval =
+                    Some(Self { interval: (interval_first, interval_last), score: interval_score });
+            }
+        };
+
+        // we compute the matches interval if we have at least 2 matches.
+        // current interval positions.
+        let mut interval_first = 0;
+        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+        for (index, next_match) in matches.iter().enumerate() {
+            // if next match would make interval gross more than crop_size,
+            // we compare the current interval with the best one,
+            // then we increase `interval_first` until next match can be added.
+            let next_match_last_word_pos = next_match.get_last_word_pos();
+
+            // if the next match would mean that we pass the crop size window,
+            // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
+            // and calculate a score for it, and check if it's better than our best so far
+            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+                // if index is 0 there is no last viable match
+                if index != 0 {
+                    let interval_last = index - 1;
+                    let interval_score =
+                        MatchIntervalScore::new(&matches[interval_first..=interval_last]);
+
+                    // keep interval if it's the best
+                    save_best_interval(interval_first, interval_last, interval_score);
+                }
+
+                // advance start of the interval while interval is longer than crop_size.
+                loop {
+                    interval_first += 1;
+                    interval_first_match_first_word_pos =
+                        matches[interval_first].get_first_word_pos();
+
+                    if interval_first_match_first_word_pos > next_match_last_word_pos
+                        || next_match_last_word_pos - interval_first_match_first_word_pos
+                            < crop_size
+                    {
+                        break;
+                    }
+                }
+            }
+        }
+
+        // compute the last interval score and compare it to the best one.
+        let interval_last = matches_len - 1;
+        // if it's the last match with itself, we need to make sure it's
+        // not a phrase longer than the crop window
+        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
+            let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]);
+            save_best_interval(interval_first, interval_last, interval_score);
+        }
+
+        // if none of the matches fit the criteria above, default to the first one
+        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
+        &matches[best_interval.0..=best_interval.1]
+    }
+}
+
 /// Structure used to analyze a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
 pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
@@ -415,14 +558,16 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
 
             (crop_byte_start, crop_byte_end)
         } else {
-            // there's one match? and it's longer than the crop window, so we have to advance inward
+            // there's one match and it's longer than the crop window, so we have to advance inward
             let mut remaining_extra_words = matches_window_len - crop_size;
             let mut tokens_from_end =
                 tokens[..=last_match_last_token_position].iter().rev().peekable();
 
             while remaining_extra_words > 0 {
-                let token_from_end_kind =
-                    tokens_from_end.peek().map(SimpleTokenKind::get).expect("TODO");
+                let token_from_end_kind = tokens_from_end
+                    .peek()
+                    .map(SimpleTokenKind::get)
+                    .expect("Expected iterator to not reach end");
                 if token_from_end_kind.is_not_separator() {
                     remaining_extra_words -= 1;
                 }
@@ -435,165 +580,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             } else {
                 &0
             };
-            let crop_byte_end = tokens_from_end.next().map(|t| t.byte_start).expect("TODO");
+            let crop_byte_end = tokens_from_end
+                .next()
+                .map(|t| t.byte_start)
+                .expect("Expected iterator to not reach end");
 
             (*crop_byte_start, crop_byte_end)
         }
     }
 
-    /// Compute the score of a match interval:
-    /// 1) count unique matches
-    /// 2) calculate distance between matches
-    /// 3) count ordered matches
-    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
-        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
-        let mut order_score = 0;
-        let mut distance_score = 0;
-
-        // count score for phrases
-        fn tally_phrase_scores(
-            fwp: &usize,
-            lwp: &usize,
-            order_score: &mut i16,
-            distance_score: &mut i16,
-        ) {
-            let words_in_phrase_minus_one = (lwp - fwp) as i16;
-            // will always be ordered, so +1 for each space between words
-            *order_score += words_in_phrase_minus_one;
-            // distance will always be 1, so -1 for each space between words
-            *distance_score -= words_in_phrase_minus_one;
-        }
-
-        let mut iter = matches.iter().peekable();
-        while let Some(m) = iter.next() {
-            if let Some(next_match) = iter.peek() {
-                // if matches are ordered
-                if next_match.ids.iter().min() > m.ids.iter().min() {
-                    order_score += 1;
-                }
-
-                let m_last_word_pos = match m.position {
-                    MatchPosition::Word { word_position, .. } => word_position,
-                    MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => {
-                        tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
-                        lwp
-                    }
-                };
-                let next_match_first_word_pos = next_match.get_first_word_pos();
-
-                // compute distance between matches
-                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
-            } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position {
-                // in case last match is a phrase, count score for its words
-                tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
-            }
-
-            ids.extend(m.ids.iter());
-        }
-
-        ids.sort_unstable();
-        ids.dedup();
-        let uniq_score = ids.len() as i16;
-
-        // rank by unique match count, then by distance between matches, then by ordered match count.
-        (uniq_score, distance_score, order_score)
-    }
-
-    /// Returns the matches interval where the score computed by match_interval_score is the best.
-    fn find_best_match_interval<'a>(&self, matches: &'a [Match], crop_size: usize) -> &'a [Match] {
-        let matches_len = matches.len();
-        if matches_len <= 1 {
-            return matches;
-        }
-
-        // positions of the first and the last match of the best matches interval in `matches`.
-        struct BestInterval {
-            interval: (usize, usize),
-            score: (i16, i16, i16),
-        }
-
-        fn save_best_interval(
-            best_interval: &mut Option<BestInterval>,
-            interval_first: usize,
-            interval_last: usize,
-            interval_score: (i16, i16, i16),
-        ) {
-            if let Some(best_interval) = best_interval {
-                if interval_score > best_interval.score {
-                    best_interval.interval = (interval_first, interval_last);
-                    best_interval.score = interval_score;
-                }
-            } else {
-                *best_interval = Some(BestInterval {
-                    interval: (interval_first, interval_last),
-                    score: interval_score,
-                });
-            }
-        }
-
-        let mut best_interval: Option<BestInterval> = None;
-
-        // we compute the matches interval if we have at least 2 matches.
-        // current interval positions.
-        let mut interval_first = 0;
-        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
-
-        for (index, next_match) in matches.iter().enumerate() {
-            // if next match would make interval gross more than crop_size,
-            // we compare the current interval with the best one,
-            // then we increase `interval_first` until next match can be added.
-            let next_match_last_word_pos = next_match.get_last_word_pos();
-
-            // if the next match would mean that we pass the crop size window,
-            // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
-            // and calculate a score for it, and check if it's better than our best so far
-            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                // if index is 0 there is no last viable match
-                if index != 0 {
-                    let interval_last = index - 1;
-                    let interval_score =
-                        self.match_interval_score(&matches[interval_first..=interval_last]);
-
-                    // keep interval if it's the best
-                    save_best_interval(
-                        &mut best_interval,
-                        interval_first,
-                        interval_last,
-                        interval_score,
-                    );
-                }
-
-                // advance start of the interval while interval is longer than crop_size.
-                loop {
-                    interval_first += 1;
-                    interval_first_match_first_word_pos =
-                        matches[interval_first].get_first_word_pos();
-
-                    if interval_first_match_first_word_pos > next_match_last_word_pos
-                        || next_match_last_word_pos - interval_first_match_first_word_pos
-                            < crop_size
-                    {
-                        break;
-                    }
-                }
-            }
-        }
-
-        // compute the last interval score and compare it to the best one.
-        let interval_last = matches_len - 1;
-        // if it's the last match with itself, we need to make sure it's
-        // not a phrase longer than the crop window
-        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
-            let interval_score =
-                self.match_interval_score(&matches[interval_first..=interval_last]);
-            save_best_interval(&mut best_interval, interval_first, interval_last, interval_score);
-        }
-
-        // if none of the matches fit the criteria above, default to the first one
-        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
-        &matches[best_interval.0..=best_interval.1]
-    }
-
     // Returns the formatted version of the original text.
     pub fn format(&mut self, format_options: FormatOptions) -> Cow<'t, str> {
         if !format_options.highlight && format_options.crop.is_none() {
@@ -606,7 +601,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     // crop around the best interval.
                     let (byte_start, byte_end) = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
-                            let matches = self.find_best_match_interval(matches, crop_size);
+                            let matches = MatchIntervalWithScore::find_best_match_interval(
+                                matches, crop_size,
+                            );
                             self.crop_bounds(tokens, matches, crop_size)
                         }
                         _ => (0, self.text.len()),
@@ -1046,6 +1043,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
+            // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase
             @"The groundbreaking invention had the power to split the world…"
         );
 
@@ -1057,6 +1055,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
+            // @TODO: Should probably include end of string in this case?
             @"…between those who <em>embraced progress and those who resisted change</em>…"
         );
 
@@ -1090,17 +1089,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…invention <em>had the power to split the world between those</em>…"
-        );
-
-        let builder = MatcherBuilder::new_test(
-            &rtxn,
-            &temp_index,
-            "\"The groundbreaking invention\" \"had the power to split the world between those\"",
-        );
-        let mut matcher = builder.build(text, None);
-        insta::assert_snapshot!(
-            matcher.format(format_options),
+            // @TODO: "invention" should be highlighted as well
             @"…invention <em>had the power to split the world between those</em>…"
         );
     }

From d9e4db9983e7017bb13a89f7e28def43069e1a58 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Tue, 1 Oct 2024 17:50:59 +0300
Subject: [PATCH 15/21] Refactor

---
 milli/src/search/new/matches/mod.rs | 38 ++++++++++++-----------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 804b59553..1552de8aa 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -166,19 +166,12 @@ enum SimpleTokenKind {
 }
 
 impl SimpleTokenKind {
-    fn get(token: &&Token<'_>) -> Self {
+    fn new(token: &&Token<'_>) -> Self {
         match token.kind {
             TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
             _ => Self::NotSeparator,
         }
     }
-
-    fn is_not_separator(&self) -> bool {
-        match self {
-            SimpleTokenKind::NotSeparator => true,
-            SimpleTokenKind::Separator(_) => false,
-        }
-    }
 }
 
 #[derive(PartialEq, PartialOrd)]
@@ -259,9 +252,12 @@ impl MatchIntervalWithScore {
 
         // positions of the first and the last match of the best matches interval in `matches`.
         let mut best_interval: Option<Self> = None;
-        let mut save_best_interval = |interval_first, interval_last, interval_score| {
+
+        let mut save_best_interval = |interval_first, interval_last| {
+            let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]);
             let is_interval_score_better =
                 &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score);
+
             if *is_interval_score_better {
                 best_interval =
                     Some(Self { interval: (interval_first, interval_last), score: interval_score });
@@ -286,11 +282,8 @@ impl MatchIntervalWithScore {
                 // if index is 0 there is no last viable match
                 if index != 0 {
                     let interval_last = index - 1;
-                    let interval_score =
-                        MatchIntervalScore::new(&matches[interval_first..=interval_last]);
-
                     // keep interval if it's the best
-                    save_best_interval(interval_first, interval_last, interval_score);
+                    save_best_interval(interval_first, interval_last);
                 }
 
                 // advance start of the interval while interval is longer than crop_size.
@@ -314,8 +307,7 @@ impl MatchIntervalWithScore {
         // if it's the last match with itself, we need to make sure it's
         // not a phrase longer than the crop window
         if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
-            let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]);
-            save_best_interval(interval_first, interval_last, interval_score);
+            save_best_interval(interval_first, interval_last);
         }
 
         // if none of the matches fit the criteria above, default to the first one
@@ -359,6 +351,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
+                            // @TODO: Shouldn't this be +1?
                             match_len: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
                             position: MatchPosition::Phrase {
@@ -484,8 +477,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             // grows the crop window peeking in both directions
             // until the window contains the good number of words:
             while remaining_words > 0 {
-                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::get);
-                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::get);
+                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
+                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new);
 
                 match (before_token_kind, after_token_kind) {
                     // we can expand both sides.
@@ -504,7 +497,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                                     if remaining_words > 1 {
                                         after_tokens.next();
                                     }
-                                } else if let SeparatorKind::Hard = before_token_separator_kind {
+                                } else if matches!(before_token_separator_kind, SeparatorKind::Hard)
+                                {
                                     after_tokens.next();
                                 } else {
                                     before_tokens.next();
@@ -536,14 +530,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     // the end of the text is reached, advance left.
                     (Some(before_token_kind), None) => {
                         before_tokens.next();
-                        if let SimpleTokenKind::NotSeparator = before_token_kind {
+                        if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
                             remaining_words -= 1;
                         }
                     }
                     // the start of the text is reached, advance right.
                     (None, Some(after_token_kind)) => {
                         after_tokens.next();
-                        if let SimpleTokenKind::NotSeparator = after_token_kind {
+                        if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
                             remaining_words -= 1;
                         }
                     }
@@ -566,9 +560,9 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             while remaining_extra_words > 0 {
                 let token_from_end_kind = tokens_from_end
                     .peek()
-                    .map(SimpleTokenKind::get)
+                    .map(SimpleTokenKind::new)
                     .expect("Expected iterator to not reach end");
-                if token_from_end_kind.is_not_separator() {
+                if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) {
                     remaining_extra_words -= 1;
                 }
 

From 37a9d64c4441bb6a4a199ad018ab4ddb44d4d958 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Tue, 1 Oct 2024 22:52:01 +0300
Subject: [PATCH 16/21] Fix failing test, refactor

---
 milli/src/search/new/matches/mod.rs | 44 ++++++++++++++++++-----------
 1 file changed, 27 insertions(+), 17 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 1552de8aa..ae1264482 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -245,8 +245,7 @@ struct MatchIntervalWithScore {
 impl MatchIntervalWithScore {
     /// Returns the matches interval where the score computed by match_interval_score is the best.
     fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] {
-        let matches_len = matches.len();
-        if matches_len <= 1 {
+        if matches.len() <= 1 {
             return matches;
         }
 
@@ -303,7 +302,7 @@ impl MatchIntervalWithScore {
         }
 
         // compute the last interval score and compare it to the best one.
-        let interval_last = matches_len - 1;
+        let interval_last = matches.len() - 1;
         // if it's the last match with itself, we need to make sure it's
         // not a phrase longer than the crop window
         if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
@@ -451,28 +450,39 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         crop_size: usize,
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
-        let first_match_first_word_position =
-            matches.first().map(|m| m.get_first_word_pos()).unwrap_or(0);
-        let first_match_first_token_position =
-            matches.first().map(|m| m.get_first_token_pos()).unwrap_or(0);
-        let last_match_last_word_position =
-            matches.last().map(|m| m.get_last_word_pos()).unwrap_or(0);
-        let last_match_last_token_position =
-            matches.last().map(|m| m.get_last_token_pos()).unwrap_or(0);
+        let (matches_size, first_match_first_token_position, last_match_last_token_position) =
+            if !matches.is_empty() {
+                let matches_first = matches.first().unwrap();
+                let matches_last = matches.last().unwrap();
 
-        let matches_window_len =
-            last_match_last_word_position - first_match_first_word_position + 1;
+                (
+                    matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1,
+                    matches_first.get_first_token_pos(),
+                    matches_last.get_last_token_pos(),
+                )
+            } else {
+                (0, 0, 0)
+            };
 
-        if crop_size >= matches_window_len {
+        if crop_size >= matches_size {
             // matches needs to be counted in the crop len.
-            let mut remaining_words = crop_size - matches_window_len;
+            let mut remaining_words = crop_size - matches_size;
+
+            let last_match_last_token_position_plus_one = last_match_last_token_position + 1;
+            let after_tokens_starting_index = if matches_size == 0 {
+                0
+            } else if last_match_last_token_position_plus_one < tokens.len() {
+                last_match_last_token_position_plus_one
+            } else {
+                tokens.len()
+            };
 
             // create the initial state of the crop window: 2 iterators starting from the matches positions,
             // a reverse iterator starting from the first match token position and going towards the beginning of the text,
             let mut before_tokens =
                 tokens[..first_match_first_token_position].iter().rev().peekable();
             // an iterator starting from the last match token position and going towards the end of the text.
-            let mut after_tokens = tokens[last_match_last_token_position + 1..].iter().peekable();
+            let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable();
 
             // grows the crop window peeking in both directions
             // until the window contains the good number of words:
@@ -553,7 +563,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
             (crop_byte_start, crop_byte_end)
         } else {
             // there's one match and it's longer than the crop window, so we have to advance inward
-            let mut remaining_extra_words = matches_window_len - crop_size;
+            let mut remaining_extra_words = matches_size - crop_size;
             let mut tokens_from_end =
                 tokens[..=last_match_last_token_position].iter().rev().peekable();
 

From 40336ce87d46b43123d03cb343b4d3f785001a9c Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Thu, 3 Oct 2024 10:40:14 +0300
Subject: [PATCH 17/21] Fix and refactor crop_bounds

---
 milli/src/search/new/matches/mod.rs | 231 ++++++++++++++--------------
 1 file changed, 113 insertions(+), 118 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index ae1264482..f8d60ef54 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -1,6 +1,7 @@
 use std::borrow::Cow;
 
 use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
+use either::Either;
 pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
@@ -450,147 +451,141 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         crop_size: usize,
     ) -> (usize, usize) {
         // if there is no match, we start from the beginning of the string by default.
-        let (matches_size, first_match_first_token_position, last_match_last_token_position) =
-            if !matches.is_empty() {
-                let matches_first = matches.first().unwrap();
-                let matches_last = matches.last().unwrap();
+        let (
+            mut remaining_words,
+            is_iterating_forward,
+            before_tokens_starting_index,
+            after_tokens_starting_index,
+        ) = if !matches.is_empty() {
+            let matches_first = matches.first().unwrap();
+            let matches_last = matches.last().unwrap();
 
-                (
-                    matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1,
-                    matches_first.get_first_token_pos(),
-                    matches_last.get_last_token_pos(),
-                )
+            let matches_size =
+                matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
+
+            let is_crop_size_gte_match_size = crop_size >= matches_size;
+            let is_iterating_forward = matches_size == 0 || is_crop_size_gte_match_size;
+
+            let remaining_words = if is_crop_size_gte_match_size {
+                crop_size - matches_size
             } else {
-                (0, 0, 0)
+                // in case matches size is greater than crop size, which implies there's only one match,
+                // we count words backwards, because we have to remove words, as they're extra words outside of
+                // crop window
+                matches_size - crop_size
             };
 
-        if crop_size >= matches_size {
-            // matches needs to be counted in the crop len.
-            let mut remaining_words = crop_size - matches_size;
-
-            let last_match_last_token_position_plus_one = last_match_last_token_position + 1;
             let after_tokens_starting_index = if matches_size == 0 {
                 0
-            } else if last_match_last_token_position_plus_one < tokens.len() {
-                last_match_last_token_position_plus_one
             } else {
-                tokens.len()
+                let last_match_last_token_position_plus_one = matches_last.get_last_token_pos() + 1;
+                if last_match_last_token_position_plus_one < tokens.len() {
+                    last_match_last_token_position_plus_one
+                } else {
+                    // we have matched the end of possible tokens, there's nothing to advance
+                    tokens.len() - 1
+                }
             };
 
-            // create the initial state of the crop window: 2 iterators starting from the matches positions,
-            // a reverse iterator starting from the first match token position and going towards the beginning of the text,
-            let mut before_tokens =
-                tokens[..first_match_first_token_position].iter().rev().peekable();
-            // an iterator starting from the last match token position and going towards the end of the text.
-            let mut after_tokens = tokens[after_tokens_starting_index..].iter().peekable();
+            (
+                remaining_words,
+                is_iterating_forward,
+                if is_iterating_forward { matches_first.get_first_token_pos() } else { 0 },
+                after_tokens_starting_index,
+            )
+        } else {
+            (crop_size, true, 0, 0)
+        };
 
-            // grows the crop window peeking in both directions
-            // until the window contains the good number of words:
-            while remaining_words > 0 {
-                let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
-                let after_token_kind = after_tokens.peek().map(SimpleTokenKind::new);
+        // create the initial state of the crop window: 2 iterators starting from the matches positions,
+        // a reverse iterator starting from the first match token position and going towards the beginning of the text,
+        let mut before_tokens = tokens[..before_tokens_starting_index].iter().rev().peekable();
+        // an iterator ...
+        let mut after_tokens = if is_iterating_forward {
+            // ... starting from the last match token position and going towards the end of the text.
+            Either::Left(tokens[after_tokens_starting_index..].iter().peekable())
+        } else {
+            // ... starting from the last match token position and going towards the start of the text.
+            Either::Right(tokens[..=after_tokens_starting_index].iter().rev().peekable())
+        };
 
-                match (before_token_kind, after_token_kind) {
-                    // we can expand both sides.
-                    (Some(before_token_kind), Some(after_token_kind)) => {
-                        match (before_token_kind, after_token_kind) {
-                            // if they are both separators and are the same kind then advance both,
-                            // or expand in the soft separator separator side.
-                            (
-                                SimpleTokenKind::Separator(before_token_separator_kind),
-                                SimpleTokenKind::Separator(after_token_separator_kind),
-                            ) => {
-                                if before_token_separator_kind == after_token_separator_kind {
-                                    before_tokens.next();
+        // grows the crop window peeking in both directions
+        // until the window contains the good number of words:
+        while remaining_words > 0 {
+            let before_token_kind = before_tokens.peek().map(SimpleTokenKind::new);
+            let after_token_kind =
+                after_tokens.as_mut().either(|v| v.peek(), |v| v.peek()).map(SimpleTokenKind::new);
 
-                                    // this avoid having an ending separator before crop marker.
-                                    if remaining_words > 1 {
-                                        after_tokens.next();
-                                    }
-                                } else if matches!(before_token_separator_kind, SeparatorKind::Hard)
-                                {
-                                    after_tokens.next();
-                                } else {
-                                    before_tokens.next();
-                                }
-                            }
-                            // if one of the tokens is a word, we expend in the side of the word.
-                            // left is a word, advance left.
-                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
+            match (before_token_kind, after_token_kind) {
+                // we can expand both sides.
+                (Some(before_token_kind), Some(after_token_kind)) => {
+                    match (before_token_kind, after_token_kind) {
+                        // if they are both separators and are the same kind then advance both,
+                        // or expand in the soft separator separator side.
+                        (
+                            SimpleTokenKind::Separator(before_token_separator_kind),
+                            SimpleTokenKind::Separator(after_token_separator_kind),
+                        ) => {
+                            if before_token_separator_kind == after_token_separator_kind {
+                                before_tokens.next();
+
+                                // this avoid having an ending separator before crop marker.
+                                if remaining_words > 1 {
+                                    after_tokens.next();
+                                }
+                            } else if matches!(before_token_separator_kind, SeparatorKind::Hard) {
+                                after_tokens.next();
+                            } else {
                                 before_tokens.next();
-                                remaining_words -= 1;
                             }
-                            // right is a word, advance right.
-                            (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
+                        }
+                        // if one of the tokens is a word, we expend in the side of the word.
+                        // left is a word, advance left.
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::Separator(_)) => {
+                            before_tokens.next();
+                            remaining_words -= 1;
+                        }
+                        // right is a word, advance right.
+                        (SimpleTokenKind::Separator(_), SimpleTokenKind::NotSeparator) => {
+                            after_tokens.next();
+                            remaining_words -= 1;
+                        }
+                        // both are words, advance left then right if remaining_word > 0.
+                        (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
+                            before_tokens.next();
+                            remaining_words -= 1;
+
+                            if remaining_words > 0 {
                                 after_tokens.next();
                                 remaining_words -= 1;
                             }
-                            // both are words, advance left then right if remaining_word > 0.
-                            (SimpleTokenKind::NotSeparator, SimpleTokenKind::NotSeparator) => {
-                                before_tokens.next();
-                                remaining_words -= 1;
-
-                                if remaining_words > 0 {
-                                    after_tokens.next();
-                                    remaining_words -= 1;
-                                }
-                            }
                         }
                     }
-                    // the end of the text is reached, advance left.
-                    (Some(before_token_kind), None) => {
-                        before_tokens.next();
-                        if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
-                            remaining_words -= 1;
-                        }
-                    }
-                    // the start of the text is reached, advance right.
-                    (None, Some(after_token_kind)) => {
-                        after_tokens.next();
-                        if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
-                            remaining_words -= 1;
-                        }
-                    }
-                    // no more token to add.
-                    (None, None) => break,
                 }
-            }
-
-            // finally, keep the byte index of each bound of the crop window.
-            let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
-            let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
-
-            (crop_byte_start, crop_byte_end)
-        } else {
-            // there's one match and it's longer than the crop window, so we have to advance inward
-            let mut remaining_extra_words = matches_size - crop_size;
-            let mut tokens_from_end =
-                tokens[..=last_match_last_token_position].iter().rev().peekable();
-
-            while remaining_extra_words > 0 {
-                let token_from_end_kind = tokens_from_end
-                    .peek()
-                    .map(SimpleTokenKind::new)
-                    .expect("Expected iterator to not reach end");
-                if matches!(token_from_end_kind, SimpleTokenKind::NotSeparator) {
-                    remaining_extra_words -= 1;
+                // the end of the text is reached, advance left.
+                (Some(before_token_kind), None) => {
+                    before_tokens.next();
+                    if matches!(before_token_kind, SimpleTokenKind::NotSeparator) {
+                        remaining_words -= 1;
+                    }
                 }
-
-                tokens_from_end.next();
+                // the start of the text is reached, advance right.
+                (None, Some(after_token_kind)) => {
+                    after_tokens.next();
+                    if matches!(after_token_kind, SimpleTokenKind::NotSeparator) {
+                        remaining_words -= 1;
+                    }
+                }
+                // no more token to add.
+                (None, None) => break,
             }
-
-            let crop_byte_start = if first_match_first_token_position > 0 {
-                &tokens[first_match_first_token_position - 1].byte_end
-            } else {
-                &0
-            };
-            let crop_byte_end = tokens_from_end
-                .next()
-                .map(|t| t.byte_start)
-                .expect("Expected iterator to not reach end");
-
-            (*crop_byte_start, crop_byte_end)
         }
+
+        // finally, keep the byte index of each bound of the crop window.
+        let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
+        let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
+
+        (crop_byte_start, crop_byte_end)
     }
 
     // Returns the formatted version of the original text.

From 8221c94e7f5666c73944cc5f57211a0eb4035b59 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Thu, 3 Oct 2024 15:37:51 +0300
Subject: [PATCH 18/21] Split into multiple files, refactor

---
 .../search/new/matches/best_match_interval.rs | 139 ++++++++++
 milli/src/search/new/matches/match.rs         |  62 +++++
 milli/src/search/new/matches/mod.rs           | 244 +-----------------
 .../search/new/matches/simple_token_kind.rs   |  15 ++
 4 files changed, 230 insertions(+), 230 deletions(-)
 create mode 100644 milli/src/search/new/matches/best_match_interval.rs
 create mode 100644 milli/src/search/new/matches/match.rs
 create mode 100644 milli/src/search/new/matches/simple_token_kind.rs

diff --git a/milli/src/search/new/matches/best_match_interval.rs b/milli/src/search/new/matches/best_match_interval.rs
new file mode 100644
index 000000000..a6497f351
--- /dev/null
+++ b/milli/src/search/new/matches/best_match_interval.rs
@@ -0,0 +1,139 @@
+use super::matching_words::WordId;
+use super::{Match, MatchPosition};
+
+struct MatchIntervalWithScore {
+    interval: [usize; 2],
+    score: [i16; 3],
+}
+
+// count score for phrases
+fn tally_phrase_scores(fwp: &usize, lwp: &usize, order_score: &mut i16, distance_score: &mut i16) {
+    let words_in_phrase_minus_one = (lwp - fwp) as i16;
+    // will always be ordered, so +1 for each space between words
+    *order_score += words_in_phrase_minus_one;
+    // distance will always be 1, so -1 for each space between words
+    *distance_score -= words_in_phrase_minus_one;
+}
+
+/// Compute the score of a match interval:
+/// 1) count unique matches
+/// 2) calculate distance between matches
+/// 3) count ordered matches
+fn get_interval_score(matches: &[Match]) -> [i16; 3] {
+    let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
+    let mut order_score = 0;
+    let mut distance_score = 0;
+
+    let mut iter = matches.iter().peekable();
+    while let Some(m) = iter.next() {
+        if let Some(next_match) = iter.peek() {
+            // if matches are ordered
+            if next_match.ids.iter().min() > m.ids.iter().min() {
+                order_score += 1;
+            }
+
+            let m_last_word_pos = match m.position {
+                MatchPosition::Word { word_position, .. } => word_position,
+                MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => {
+                    tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+                    lwp
+                }
+            };
+            let next_match_first_word_pos = next_match.get_first_word_pos();
+
+            // compute distance between matches
+            distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
+        } else if let MatchPosition::Phrase { word_positions: [fwp, lwp], .. } = m.position {
+            // in case last match is a phrase, count score for its words
+            tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
+        }
+
+        ids.extend(m.ids.iter());
+    }
+
+    ids.sort_unstable();
+    ids.dedup();
+    let uniq_score = ids.len() as i16;
+
+    // rank by unique match count, then by distance between matches, then by ordered match count.
+    [uniq_score, distance_score, order_score]
+}
+
+/// Returns the first and last match where the score computed by match_interval_score is the best.
+pub fn find_best_match_interval(matches: &[Match], crop_size: usize) -> [&Match; 2] {
+    if matches.is_empty() {
+        panic!("`matches` should not be empty at this point");
+    }
+
+    // positions of the first and the last match of the best matches interval in `matches`.
+    let mut best_interval: Option<MatchIntervalWithScore> = None;
+
+    let mut save_best_interval = |interval_first, interval_last| {
+        let interval_score = get_interval_score(&matches[interval_first..=interval_last]);
+        let is_interval_score_better = &best_interval
+            .as_ref()
+            .map_or(true, |MatchIntervalWithScore { score, .. }| interval_score > *score);
+
+        if *is_interval_score_better {
+            best_interval = Some(MatchIntervalWithScore {
+                interval: [interval_first, interval_last],
+                score: interval_score,
+            });
+        }
+    };
+
+    // we compute the matches interval if we have at least 2 matches.
+    // current interval positions.
+    let mut interval_first = 0;
+    let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+    for (index, next_match) in matches.iter().enumerate() {
+        // if next match would make interval gross more than crop_size,
+        // we compare the current interval with the best one,
+        // then we increase `interval_first` until next match can be added.
+        let next_match_last_word_pos = next_match.get_last_word_pos();
+
+        // if the next match would mean that we pass the crop size window,
+        // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
+        // and calculate a score for it, and check if it's better than our best so far
+        if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
+            // if index is 0 there is no last viable match
+            if index != 0 {
+                let interval_last = index - 1;
+                // keep interval if it's the best
+                save_best_interval(interval_first, interval_last);
+            }
+
+            // advance start of the interval while interval is longer than crop_size.
+            loop {
+                interval_first += 1;
+                if interval_first == matches.len() {
+                    interval_first -= 1;
+                    break;
+                }
+
+                interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
+
+                if interval_first_match_first_word_pos > next_match_last_word_pos
+                    || next_match_last_word_pos - interval_first_match_first_word_pos < crop_size
+                {
+                    break;
+                }
+            }
+        }
+    }
+
+    // compute the last interval score and compare it to the best one.
+    let interval_last = matches.len() - 1;
+    // if it's the last match with itself, we need to make sure it's
+    // not a phrase longer than the crop window
+    if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
+        save_best_interval(interval_first, interval_last);
+    }
+
+    // if none of the matches fit the criteria above, default to the first one
+    best_interval.map_or(
+        [&matches[0], &matches[0]],
+        |MatchIntervalWithScore { interval: [first, last], .. }| [&matches[first], &matches[last]],
+    )
+}
diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs
new file mode 100644
index 000000000..cc08b006c
--- /dev/null
+++ b/milli/src/search/new/matches/match.rs
@@ -0,0 +1,62 @@
+use super::matching_words::WordId;
+
+#[derive(Clone, Debug)]
+pub enum MatchPosition {
+    Word {
+        // position of the word in the whole text.
+        word_position: usize,
+        // position of the token in the whole text.
+        token_position: usize,
+    },
+    Phrase {
+        // position of the first and last word in the phrase in the whole text.
+        word_positions: [usize; 2],
+        // position of the first and last token in the phrase in the whole text.
+        token_positions: [usize; 2],
+    },
+}
+
+#[derive(Clone, Debug)]
+pub struct Match {
+    pub match_len: usize,
+    // ids of the query words that matches.
+    pub ids: Vec<WordId>,
+    pub position: MatchPosition,
+}
+
+impl Match {
+    pub(super) fn get_first_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [fwp, _], .. } => fwp,
+        }
+    }
+
+    pub(super) fn get_last_word_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { word_position, .. } => word_position,
+            MatchPosition::Phrase { word_positions: [_, lwp], .. } => lwp,
+        }
+    }
+
+    pub(super) fn get_first_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [ftp, _], .. } => ftp,
+        }
+    }
+
+    pub(super) fn get_last_token_pos(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { token_position, .. } => token_position,
+            MatchPosition::Phrase { token_positions: [_, ltp], .. } => ltp,
+        }
+    }
+
+    pub(super) fn get_word_count(&self) -> usize {
+        match self.position {
+            MatchPosition::Word { .. } => 1,
+            MatchPosition::Phrase { word_positions: [fwp, lwp], .. } => lwp - fwp + 1,
+        }
+    }
+}
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index f8d60ef54..3df361702 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -1,12 +1,16 @@
-use std::borrow::Cow;
+mod best_match_interval;
+mod r#match;
+mod matching_words;
+mod simple_token_kind;
 
-use charabia::{Language, SeparatorKind, Token, TokenKind, Tokenizer};
+use charabia::{Language, SeparatorKind, Token, Tokenizer};
 use either::Either;
 pub use matching_words::MatchingWords;
-use matching_words::{MatchType, PartialMatch, WordId};
+use matching_words::{MatchType, PartialMatch};
+use r#match::{Match, MatchPosition};
 use serde::Serialize;
-
-pub mod matching_words;
+use simple_token_kind::SimpleTokenKind;
+use std::borrow::Cow;
 
 const DEFAULT_CROP_MARKER: &str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@@ -94,228 +98,12 @@ impl FormatOptions {
     }
 }
 
-#[derive(Clone, Debug)]
-pub enum MatchPosition {
-    Word {
-        // position of the word in the whole text.
-        word_position: usize,
-        // position of the token in the whole text.
-        token_position: usize,
-    },
-    Phrase {
-        // position of the first and last word in the phrase in the whole text.
-        word_positions: (usize, usize),
-        // position of the first and last token in the phrase in the whole text.
-        token_positions: (usize, usize),
-    },
-}
-
-#[derive(Clone, Debug)]
-pub struct Match {
-    match_len: usize,
-    // ids of the query words that matches.
-    ids: Vec<WordId>,
-    position: MatchPosition,
-}
-
-impl Match {
-    fn get_first_word_pos(&self) -> usize {
-        match self.position {
-            MatchPosition::Word { word_position, .. } => word_position,
-            MatchPosition::Phrase { word_positions: (fwp, _), .. } => fwp,
-        }
-    }
-
-    fn get_last_word_pos(&self) -> usize {
-        match self.position {
-            MatchPosition::Word { word_position, .. } => word_position,
-            MatchPosition::Phrase { word_positions: (_, lwp), .. } => lwp,
-        }
-    }
-
-    fn get_first_token_pos(&self) -> usize {
-        match self.position {
-            MatchPosition::Word { token_position, .. } => token_position,
-            MatchPosition::Phrase { token_positions: (ftp, _), .. } => ftp,
-        }
-    }
-
-    fn get_last_token_pos(&self) -> usize {
-        match self.position {
-            MatchPosition::Word { token_position, .. } => token_position,
-            MatchPosition::Phrase { token_positions: (_, ltp), .. } => ltp,
-        }
-    }
-
-    fn get_word_count(&self) -> usize {
-        match self.position {
-            MatchPosition::Word { .. } => 1,
-            MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => lwp - fwp + 1,
-        }
-    }
-}
-
 #[derive(Serialize, Debug, Clone, PartialEq, Eq)]
 pub struct MatchBounds {
     pub start: usize,
     pub length: usize,
 }
 
-enum SimpleTokenKind {
-    Separator(SeparatorKind),
-    NotSeparator,
-}
-
-impl SimpleTokenKind {
-    fn new(token: &&Token<'_>) -> Self {
-        match token.kind {
-            TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
-            _ => Self::NotSeparator,
-        }
-    }
-}
-
-#[derive(PartialEq, PartialOrd)]
-struct MatchIntervalScore(i16, i16, i16);
-
-impl MatchIntervalScore {
-    /// Compute the score of a match interval:
-    /// 1) count unique matches
-    /// 2) calculate distance between matches
-    /// 3) count ordered matches
-    fn new(matches: &[Match]) -> Self {
-        let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
-        let mut order_score = 0;
-        let mut distance_score = 0;
-
-        // count score for phrases
-        fn tally_phrase_scores(
-            fwp: &usize,
-            lwp: &usize,
-            order_score: &mut i16,
-            distance_score: &mut i16,
-        ) {
-            let words_in_phrase_minus_one = (lwp - fwp) as i16;
-            // will always be ordered, so +1 for each space between words
-            *order_score += words_in_phrase_minus_one;
-            // distance will always be 1, so -1 for each space between words
-            *distance_score -= words_in_phrase_minus_one;
-        }
-
-        let mut iter = matches.iter().peekable();
-        while let Some(m) = iter.next() {
-            if let Some(next_match) = iter.peek() {
-                // if matches are ordered
-                if next_match.ids.iter().min() > m.ids.iter().min() {
-                    order_score += 1;
-                }
-
-                let m_last_word_pos = match m.position {
-                    MatchPosition::Word { word_position, .. } => word_position,
-                    MatchPosition::Phrase { word_positions: (fwp, lwp), .. } => {
-                        tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
-                        lwp
-                    }
-                };
-                let next_match_first_word_pos = next_match.get_first_word_pos();
-
-                // compute distance between matches
-                distance_score -= (next_match_first_word_pos - m_last_word_pos).min(7) as i16;
-            } else if let MatchPosition::Phrase { word_positions: (fwp, lwp), .. } = m.position {
-                // in case last match is a phrase, count score for its words
-                tally_phrase_scores(&fwp, &lwp, &mut order_score, &mut distance_score);
-            }
-
-            ids.extend(m.ids.iter());
-        }
-
-        ids.sort_unstable();
-        ids.dedup();
-        let uniq_score = ids.len() as i16;
-
-        // rank by unique match count, then by distance between matches, then by ordered match count.
-        Self(uniq_score, distance_score, order_score)
-    }
-}
-
-struct MatchIntervalWithScore {
-    interval: (usize, usize),
-    score: MatchIntervalScore,
-}
-
-impl MatchIntervalWithScore {
-    /// Returns the matches interval where the score computed by match_interval_score is the best.
-    fn find_best_match_interval(matches: &[Match], crop_size: usize) -> &[Match] {
-        if matches.len() <= 1 {
-            return matches;
-        }
-
-        // positions of the first and the last match of the best matches interval in `matches`.
-        let mut best_interval: Option<Self> = None;
-
-        let mut save_best_interval = |interval_first, interval_last| {
-            let interval_score = MatchIntervalScore::new(&matches[interval_first..=interval_last]);
-            let is_interval_score_better =
-                &best_interval.as_ref().map_or(true, |Self { score, .. }| interval_score > *score);
-
-            if *is_interval_score_better {
-                best_interval =
-                    Some(Self { interval: (interval_first, interval_last), score: interval_score });
-            }
-        };
-
-        // we compute the matches interval if we have at least 2 matches.
-        // current interval positions.
-        let mut interval_first = 0;
-        let mut interval_first_match_first_word_pos = matches[interval_first].get_first_word_pos();
-
-        for (index, next_match) in matches.iter().enumerate() {
-            // if next match would make interval gross more than crop_size,
-            // we compare the current interval with the best one,
-            // then we increase `interval_first` until next match can be added.
-            let next_match_last_word_pos = next_match.get_last_word_pos();
-
-            // if the next match would mean that we pass the crop size window,
-            // we take the last valid match, that didn't pass this boundry, which is `index` - 1,
-            // and calculate a score for it, and check if it's better than our best so far
-            if next_match_last_word_pos - interval_first_match_first_word_pos >= crop_size {
-                // if index is 0 there is no last viable match
-                if index != 0 {
-                    let interval_last = index - 1;
-                    // keep interval if it's the best
-                    save_best_interval(interval_first, interval_last);
-                }
-
-                // advance start of the interval while interval is longer than crop_size.
-                loop {
-                    interval_first += 1;
-                    interval_first_match_first_word_pos =
-                        matches[interval_first].get_first_word_pos();
-
-                    if interval_first_match_first_word_pos > next_match_last_word_pos
-                        || next_match_last_word_pos - interval_first_match_first_word_pos
-                            < crop_size
-                    {
-                        break;
-                    }
-                }
-            }
-        }
-
-        // compute the last interval score and compare it to the best one.
-        let interval_last = matches.len() - 1;
-        // if it's the last match with itself, we need to make sure it's
-        // not a phrase longer than the crop window
-        if interval_first != interval_last || matches[interval_first].get_word_count() < crop_size {
-            save_best_interval(interval_first, interval_last);
-        }
-
-        // if none of the matches fit the criteria above, default to the first one
-        let best_interval = best_interval.map_or((0, 0), |v| v.interval);
-        &matches[best_interval.0..=best_interval.1]
-    }
-}
-
 /// Structure used to analyze a string, compute words that match,
 /// and format the source string, returning a highlighted and cropped sub-string.
 pub struct Matcher<'t, 'tokenizer, 'b, 'lang> {
@@ -355,8 +143,8 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                             match_len: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
                             position: MatchPosition::Phrase {
-                                word_positions: (first_word_position, word_position),
-                                token_positions: (first_token_position, token_position),
+                                word_positions: [first_word_position, word_position],
+                                token_positions: [first_token_position, token_position],
                             },
                         });
 
@@ -450,15 +238,14 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         matches: &[Match],
         crop_size: usize,
     ) -> (usize, usize) {
-        // if there is no match, we start from the beginning of the string by default.
         let (
             mut remaining_words,
             is_iterating_forward,
             before_tokens_starting_index,
             after_tokens_starting_index,
         ) = if !matches.is_empty() {
-            let matches_first = matches.first().unwrap();
-            let matches_last = matches.last().unwrap();
+            let [matches_first, matches_last] =
+                best_match_interval::find_best_match_interval(matches, crop_size);
 
             let matches_size =
                 matches_last.get_last_word_pos() - matches_first.get_first_word_pos() + 1;
@@ -600,9 +387,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     // crop around the best interval.
                     let (byte_start, byte_end) = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
-                            let matches = MatchIntervalWithScore::find_best_match_interval(
-                                matches, crop_size,
-                            );
                             self.crop_bounds(tokens, matches, crop_size)
                         }
                         _ => (0, self.text.len()),
@@ -625,7 +409,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                                     let token = &tokens[token_position];
                                     (&token.byte_start, &token.byte_end)
                                 }
-                                MatchPosition::Phrase { token_positions: (ftp, ltp), .. } => {
+                                MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
                                     (&tokens[ftp].byte_start, &tokens[ltp].byte_end)
                                 }
                             };
diff --git a/milli/src/search/new/matches/simple_token_kind.rs b/milli/src/search/new/matches/simple_token_kind.rs
new file mode 100644
index 000000000..b34a8c985
--- /dev/null
+++ b/milli/src/search/new/matches/simple_token_kind.rs
@@ -0,0 +1,15 @@
+use charabia::{SeparatorKind, Token, TokenKind};
+
+pub enum SimpleTokenKind {
+    Separator(SeparatorKind),
+    NotSeparator,
+}
+
+impl SimpleTokenKind {
+    pub fn new(token: &&Token<'_>) -> Self {
+        match token.kind {
+            TokenKind::Separator(separaor_kind) => Self::Separator(separaor_kind),
+            _ => Self::NotSeparator,
+        }
+    }
+}

From c3de3a9ab75e6be99314400137b8329cdf46ff12 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 4 Oct 2024 11:30:31 +0300
Subject: [PATCH 19/21] Refactor

---
 milli/src/search/new/matches/matching_words.rs | 12 +++---------
 milli/src/search/new/matches/mod.rs            |  1 -
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index 4deaff6a0..e4d2785ca 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -130,7 +130,7 @@ impl<'a> Iterator for MatchesIter<'a, '_> {
                         word.map(|word| self.matching_words.word_interner.get(word).as_str())
                     })
                     .collect();
-                let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
+                let partial = PartialMatch { matching_words: words, ids };
 
                 partial.match_token(self.token).or_else(|| self.next())
             }
@@ -158,7 +158,6 @@ pub enum MatchType<'a> {
 pub struct PartialMatch<'a> {
     matching_words: Vec<Option<&'a str>>,
     ids: &'a RangeInclusive<WordId>,
-    char_len: usize,
 }
 
 impl<'a> PartialMatch<'a> {
@@ -176,25 +175,20 @@ impl<'a> PartialMatch<'a> {
             None => token.is_stopword(),
         };
 
-        let char_len = token.char_end - token.char_start;
         // if there are remaining words to match in the phrase and the current token is matching,
         // return a new Partial match allowing the highlighter to continue.
         if is_matching && matching_words.len() > 1 {
             matching_words.remove(0);
-            Some(MatchType::Partial(Self { matching_words, ids, char_len }))
+            Some(MatchType::Partial(Self { matching_words, ids }))
         // if there is no remaining word to match in the phrase and the current token is matching,
         // return a Full match.
         } else if is_matching {
-            Some(MatchType::Full { char_len, ids })
+            Some(MatchType::Full { char_len: token.char_end - token.char_start, ids })
         // if the current token doesn't match, return None to break the match sequence.
         } else {
             None
         }
     }
-
-    pub fn char_len(&self) -> usize {
-        self.char_len
-    }
 }
 
 impl fmt::Debug for MatchingWords {
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 3df361702..9ca560529 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -139,7 +139,6 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
-                            // @TODO: Shouldn't this be +1?
                             match_len: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
                             position: MatchPosition::Phrase {

From 03579aba13853560059cec3c881e284b4f7a307a Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Fri, 4 Oct 2024 11:38:47 +0300
Subject: [PATCH 20/21] Adjust test

---
 milli/src/search/new/matches/mod.rs | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 9ca560529..ac0fb7e7b 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -798,12 +798,12 @@ mod tests {
             @"…the power to split <em>the world</em> between those who embraced…"
         );
 
-        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"power to\" \"and those\"");
+        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
         let mut matcher = builder.build(text, None);
         // should highlight "those" and the phrase "and those".
         insta::assert_snapshot!(
             matcher.format(format_options),
-            @"…groundbreaking invention had the <em>power to</em> split the world between…"
+            @"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
         );
 
         let builder = MatcherBuilder::new_test(
@@ -841,17 +841,6 @@ mod tests {
             @"…between those who <em>embraced progress and those who resisted change</em>…"
         );
 
-        let builder = MatcherBuilder::new_test(
-            &rtxn,
-            &temp_index,
-            "\"The groundbreaking invention\" \"split the world between those\"",
-        );
-        let mut matcher = builder.build(text, None);
-        insta::assert_snapshot!(
-            matcher.format(format_options),
-            @"…the power to <em>split the world between those</em> who embraced…"
-        );
-
         let builder = MatcherBuilder::new_test(
             &rtxn,
             &temp_index,

From e51e6f902a13525610c4d0a81125c7292da3de36 Mon Sep 17 00:00:00 2001
From: "F. Levi" <55688616+flevi29@users.noreply.github.com>
Date: Sat, 19 Oct 2024 13:42:02 +0300
Subject: [PATCH 21/21] Highlight partially cropped matches too

---
 milli/src/search/new/matches/match.rs         |  2 +-
 .../src/search/new/matches/matching_words.rs  | 25 +++--
 milli/src/search/new/matches/mod.rs           | 94 ++++++++++---------
 3 files changed, 67 insertions(+), 54 deletions(-)

diff --git a/milli/src/search/new/matches/match.rs b/milli/src/search/new/matches/match.rs
index cc08b006c..2eef4d5a6 100644
--- a/milli/src/search/new/matches/match.rs
+++ b/milli/src/search/new/matches/match.rs
@@ -18,7 +18,7 @@ pub enum MatchPosition {
 
 #[derive(Clone, Debug)]
 pub struct Match {
-    pub match_len: usize,
+    pub char_count: usize,
     // ids of the query words that matches.
     pub ids: Vec<WordId>,
     pub position: MatchPosition,
diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index e4d2785ca..1f30a17ad 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -86,14 +86,17 @@ impl MatchingWords {
                         continue;
                     };
                     let prefix_length = char_index + c.len_utf8();
-                    let char_len = token.original_lengths(prefix_length).0;
+                    let (char_count, byte_len) = token.original_lengths(prefix_length);
                     let ids = &located_words.positions;
-                    return Some(MatchType::Full { char_len, ids });
+                    return Some(MatchType::Full { ids, char_count, byte_len });
                 // else we exact match the token.
                 } else if token.lemma() == word {
-                    let char_len = token.char_end - token.char_start;
                     let ids = &located_words.positions;
-                    return Some(MatchType::Full { char_len, ids });
+                    return Some(MatchType::Full {
+                        char_count: token.char_end - token.char_start,
+                        byte_len: token.byte_end - token.byte_start,
+                        ids,
+                    });
                 }
             }
         }
@@ -149,7 +152,7 @@ pub type WordId = u16;
 /// In these cases we need to match consecutively several tokens to consider that the match is full.
 #[derive(Debug, PartialEq)]
 pub enum MatchType<'a> {
-    Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
+    Full { char_count: usize, byte_len: usize, ids: &'a RangeInclusive<WordId> },
     Partial(PartialMatch<'a>),
 }
 
@@ -183,7 +186,11 @@ impl<'a> PartialMatch<'a> {
         // if there is no remaining word to match in the phrase and the current token is matching,
         // return a Full match.
         } else if is_matching {
-            Some(MatchType::Full { char_len: token.char_end - token.char_start, ids })
+            Some(MatchType::Full {
+                char_count: token.char_end - token.char_start,
+                byte_len: token.byte_end - token.byte_start,
+                ids,
+            })
         // if the current token doesn't match, return None to break the match sequence.
         } else {
             None
@@ -270,7 +277,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(0..=0) })
         );
         assert_eq!(
             matching_words
@@ -294,7 +301,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
         );
         assert_eq!(
             matching_words
@@ -306,7 +313,7 @@ pub(crate) mod tests {
                     ..Default::default()
                 })
                 .next(),
-            Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
+            Some(MatchType::Full { char_count: 5, byte_len: 5, ids: &(2..=2) })
         );
         assert_eq!(
             matching_words
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index ac0fb7e7b..80e3ec7b2 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -10,7 +10,10 @@ use matching_words::{MatchType, PartialMatch};
 use r#match::{Match, MatchPosition};
 use serde::Serialize;
 use simple_token_kind::SimpleTokenKind;
-use std::borrow::Cow;
+use std::{
+    borrow::Cow,
+    cmp::{max, min},
+};
 
 const DEFAULT_CROP_MARKER: &str = "…";
 const DEFAULT_HIGHLIGHT_PREFIX: &str = "<em>";
@@ -139,7 +142,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                     Some(MatchType::Full { ids, .. }) => {
                         // save the token that closes the partial match as a match.
                         matches.push(Match {
-                            match_len: word.char_end - *first_word_char_start,
+                            char_count: word.char_end - *first_word_char_start,
                             ids: ids.clone().collect(),
                             position: MatchPosition::Phrase {
                                 word_positions: [first_word_position, word_position],
@@ -182,10 +185,10 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 match match_type {
                     // we match, we save the current token as a match,
                     // then we continue the rest of the tokens.
-                    MatchType::Full { char_len, ids } => {
+                    MatchType::Full { ids, char_count, .. } => {
                         let ids: Vec<_> = ids.clone().collect();
                         matches.push(Match {
-                            match_len: char_len,
+                            char_count,
                             ids,
                             position: MatchPosition::Word { word_position, token_position },
                         });
@@ -224,19 +227,15 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
                 .iter()
                 .map(|m| MatchBounds {
                     start: tokens[m.get_first_token_pos()].byte_start,
-                    length: m.match_len,
+                    // TODO: Why is this in chars, while start is in bytes?
+                    length: m.char_count,
                 })
                 .collect(),
         }
     }
 
     /// Returns the bounds in byte index of the crop window.
-    fn crop_bounds(
-        &self,
-        tokens: &[Token<'_>],
-        matches: &[Match],
-        crop_size: usize,
-    ) -> (usize, usize) {
+    fn crop_bounds(&self, tokens: &[Token<'_>], matches: &[Match], crop_size: usize) -> [usize; 2] {
         let (
             mut remaining_words,
             is_iterating_forward,
@@ -371,7 +370,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         let crop_byte_start = before_tokens.next().map_or(0, |t| t.byte_end);
         let crop_byte_end = after_tokens.next().map_or(self.text.len(), |t| t.byte_start);
 
-        (crop_byte_start, crop_byte_end)
+        [crop_byte_start, crop_byte_end]
     }
 
     // Returns the formatted version of the original text.
@@ -382,78 +381,87 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
         } else {
             match &self.matches {
                 Some((tokens, matches)) => {
-                    // If the text has to be cropped,
-                    // crop around the best interval.
-                    let (byte_start, byte_end) = match format_options.crop {
+                    // If the text has to be cropped, crop around the best interval.
+                    let [crop_byte_start, crop_byte_end] = match format_options.crop {
                         Some(crop_size) if crop_size > 0 => {
                             self.crop_bounds(tokens, matches, crop_size)
                         }
-                        _ => (0, self.text.len()),
+                        _ => [0, self.text.len()],
                     };
 
                     let mut formatted = Vec::new();
 
                     // push crop marker if it's not the start of the text.
-                    if byte_start > 0 && !self.crop_marker.is_empty() {
+                    if crop_byte_start > 0 && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }
 
-                    let mut byte_index = byte_start;
+                    let mut byte_index = crop_byte_start;
 
                     if format_options.highlight {
                         // insert highlight markers around matches.
                         for m in matches {
-                            let (current_byte_start, current_byte_end) = match m.position {
+                            let [m_byte_start, m_byte_end] = match m.position {
                                 MatchPosition::Word { token_position, .. } => {
                                     let token = &tokens[token_position];
-                                    (&token.byte_start, &token.byte_end)
+                                    [&token.byte_start, &token.byte_end]
                                 }
                                 MatchPosition::Phrase { token_positions: [ftp, ltp], .. } => {
-                                    (&tokens[ftp].byte_start, &tokens[ltp].byte_end)
+                                    [&tokens[ftp].byte_start, &tokens[ltp].byte_end]
                                 }
                             };
 
-                            // skip matches out of the crop window.
-                            if *current_byte_start < byte_start || *current_byte_end > byte_end {
+                            // skip matches out of the crop window
+                            if *m_byte_end < crop_byte_start || *m_byte_start > crop_byte_end {
                                 continue;
                             }
 
-                            if byte_index < *current_byte_start {
-                                formatted.push(&self.text[byte_index..*current_byte_start]);
+                            // adjust start and end to the crop window size
+                            let [m_byte_start, m_byte_end] = [
+                                max(m_byte_start, &crop_byte_start),
+                                min(m_byte_end, &crop_byte_end),
+                            ];
+
+                            // push text that is positioned before our matches
+                            if byte_index < *m_byte_start {
+                                formatted.push(&self.text[byte_index..*m_byte_start]);
                             }
 
-                            let highlight_byte_index = self.text[*current_byte_start..]
-                                .char_indices()
-                                .enumerate()
-                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(*current_byte_end, |(_, (i, _))| i + *current_byte_start);
-
                             formatted.push(self.highlight_prefix);
-                            formatted.push(&self.text[*current_byte_start..highlight_byte_index]);
+
+                            // TODO: This is additional work done, charabia::token::Token byte_len
+                            // should already get us the original byte length, however, that doesn't work as
+                            // it's supposed to, investigate why
+                            let highlight_byte_index = self.text[*m_byte_start..]
+                                .char_indices()
+                                .nth(m.char_count)
+                                .map_or(*m_byte_end, |(i, _)| min(i + *m_byte_start, *m_byte_end));
+                            formatted.push(&self.text[*m_byte_start..highlight_byte_index]);
+
                             formatted.push(self.highlight_suffix);
 
                             // if it's a prefix highlight, we put the end of the word after the highlight marker.
-                            if highlight_byte_index < *current_byte_end {
-                                formatted.push(&self.text[highlight_byte_index..*current_byte_end]);
+                            if highlight_byte_index < *m_byte_end {
+                                formatted.push(&self.text[highlight_byte_index..*m_byte_end]);
                             }
 
-                            byte_index = *current_byte_end;
+                            byte_index = *m_byte_end;
                         }
                     }
 
                     // push the rest of the text between last match and the end of crop.
-                    if byte_index < byte_end {
-                        formatted.push(&self.text[byte_index..byte_end]);
+                    if byte_index < crop_byte_end {
+                        formatted.push(&self.text[byte_index..crop_byte_end]);
                     }
 
                     // push crop marker if it's not the end of the text.
-                    if byte_end < self.text.len() && !self.crop_marker.is_empty() {
+                    if crop_byte_end < self.text.len() && !self.crop_marker.is_empty() {
                         formatted.push(self.crop_marker);
                     }
 
                     if formatted.len() == 1 {
                         // avoid concatenating if there is already 1 slice.
-                        Cow::Borrowed(&self.text[byte_start..byte_end])
+                        Cow::Borrowed(&self.text[crop_byte_start..crop_byte_end])
                     } else {
                         Cow::Owned(formatted.concat())
                     }
@@ -825,8 +833,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: Should probably highlight it all, even if it didn't fit the whole phrase
-            @"The groundbreaking invention had the power to split the world…"
+            @"<em>The groundbreaking invention had the power to split the world</em>…"
         );
 
         let builder = MatcherBuilder::new_test(
@@ -837,7 +844,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: Should probably include end of string in this case?
+            // TODO: Should include exclamation mark without crop markers
             @"…between those who <em>embraced progress and those who resisted change</em>…"
         );
 
@@ -860,8 +867,7 @@ mod tests {
         let mut matcher = builder.build(text, None);
         insta::assert_snapshot!(
             matcher.format(format_options),
-            // @TODO: "invention" should be highlighted as well
-            @"…invention <em>had the power to split the world between those</em>…"
+            @"…<em>invention</em> <em>had the power to split the world between those</em>…"
         );
     }