From fa7d3a37c0d86d8b3129071889e6bc3e4746a26d Mon Sep 17 00:00:00 2001
From: ManyTheFish <many@meilisearch.com>
Date: Tue, 5 Apr 2022 17:35:52 +0200
Subject: [PATCH] Make some cleaning and add comments

---
 milli/src/search/matches/mod.rs | 180 +++++++++++++++++++++-----------
 1 file changed, 117 insertions(+), 63 deletions(-)
diff --git a/milli/src/search/matches/mod.rs b/milli/src/search/matches/mod.rs
index a99798a9b..993ee1f2b 100644
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@@ -4,6 +4,8 @@ pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PrimitiveWordId};
 use meilisearch_tokenizer::token::{SeparatorKind, Token};
 
+use crate::search::matches::matching_words::PartialMatch;
+
 pub mod matching_words;
 
 const DEFAULT_CROP_SIZE: usize = 10;
@@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> {
 }
 
 impl<'t> Matcher<'t, '_> {
+    /// Iterates over tokens and saves any of them that matches the query.
     fn compute_matches(&mut self) -> &mut Self {
+        fn compute_partial_match(
+            mut partial: PartialMatch,
+            tokens: &[Token],
+            token_position: &mut usize,
+            word_position: &mut usize,
+            matches: &mut Vec<Match>,
+        ) -> bool {
+            let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())];
+            let mut t_position = 1; // token offset from the first matched token.
+            let mut w_position = 1; // word offset from the first matched token.
+            for token in &tokens[*token_position + 1..] {
+                if token.is_separator().is_none() {
+                    partial = match partial.match_token(&token) {
+                        // the token extends the partial match, but the match is still not full:
+                        // we temporarily save the current token, then we try to match the next one.
+                        Some(MatchType::Partial(partial)) => {
+                            potential_matches.push((
+                                *token_position + t_position,
+                                *word_position + w_position,
+                                partial.char_len(),
+                            ));
+                            partial
+                        }
+                        // the partial match is now full, we keep these matches and advance positions.
+                        Some(MatchType::Full { char_len, ids }) => {
+                            // save the previously recorded tokens as matches.
+                            let iter = potential_matches.into_iter().map(
+                                |(token_position, word_position, match_len)| Match {
+                                    match_len,
+                                    ids: ids.to_vec(),
+                                    word_position,
+                                    token_position,
+                                },
+                            );
+                            matches.extend(iter);
+
+                            // move word and token positions onto the token that closes the match.
+                            *word_position += w_position;
+                            *token_position += t_position;
+
+                            // save the token that closes the partial match as a match.
+                            matches.push(Match {
+                                match_len: char_len,
+                                ids: ids.to_vec(),
+                                word_position: *word_position,
+                                token_position: *token_position,
+                            });
+
+                            // the match is complete, we return true.
+                            return true;
+                        }
+                        // no match: stop scanning, nothing is saved and positions are left untouched.
+                        None => break,
+                    };
+                    w_position += 1;
+                }
+                t_position += 1;
+            }
+
+            // the match is not complete, we return false.
+            false
+        }
+
         let mut matches = Vec::new();
         let mut word_position = 0;
         let mut token_position = 0;
         while let Some(token) = self.tokens.get(token_position) {
             if token.is_separator().is_none() {
-                'matches: for match_type in self.matching_words.match_token(&token) {
+                for match_type in self.matching_words.match_token(&token) {
                     match match_type {
+                        // we match, we save the current token as a match,
+                        // then we continue the rest of the tokens.
                         MatchType::Full { char_len, ids } => {
                             matches.push(Match {
                                 match_len: char_len,
@@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> {
                                 word_position,
                                 token_position,
                             });
-                            // stop on the first match
                             break;
                         }
-                        MatchType::Partial(mut partial) => {
-                            let mut potential_matches =
-                                vec![(token_position, word_position, partial.char_len())];
-                            let mut t_position = 1;
-                            let mut w_position = 1;
-                            'partials: for token in &self.tokens[token_position + 1..] {
-                                if token.is_separator().is_none() {
-                                    partial = match partial.match_token(&token) {
-                                        Some(MatchType::Partial(partial)) => {
-                                            potential_matches.push((
-                                                token_position + t_position,
-                                                word_position + w_position,
-                                                partial.char_len(),
-                                            ));
-                                            partial
-                                        }
-                                        // partial match is now full, we keep this matches and we advance positions
-                                        Some(MatchType::Full { char_len, ids }) => {
-                                            let iter = potential_matches.into_iter().map(
-                                                |(token_position, word_position, match_len)| {
-                                                    Match {
-                                                        match_len,
-                                                        ids: ids.to_vec(),
-                                                        word_position,
-                                                        token_position,
-                                                    }
-                                                },
-                                            );
-
-                                            matches.extend(iter);
-
-                                            word_position += w_position;
-                                            token_position += t_position;
-
-                                            matches.push(Match {
-                                                match_len: char_len,
-                                                ids: ids.to_vec(),
-                                                word_position,
-                                                token_position,
-                                            });
-
-                                            break 'matches;
-                                        }
-                                        // no match, continue to next match.
-                                        None => break 'partials,
-                                    };
-                                    w_position += 1;
-                                }
-                                t_position += 1;
+                        // we match partially, iterate over next tokens to check if we can complete the match.
+                        MatchType::Partial(partial) => {
+                            // if match is completed, we break the matching loop over the current token,
+                            // then we continue the rest of the tokens.
+                            if compute_partial_match(
+                                partial,
+                                &self.tokens,
+                                &mut token_position,
+                                &mut word_position,
+                                &mut matches,
+                            ) {
+                                break;
                             }
                         }
                     }
@@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> {
         self
     }
 
+    /// Returns boundaries of the words that match the query.
     pub fn matches(&mut self) -> Vec<MatchBounds> {
         match &self.matches {
             None => self.compute_matches().matches(),
@@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> {
         }
     }
 
+    /// Returns token position of the window to crop around.
     fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
+        // if there is no match, we start from the beginning of the string by default.
         let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
         let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
         let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
         let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
 
-        // TODO: buggy if no match and first token is a sepparator
+        // matches need to be counted in the crop length.
         let mut remaining_words =
             self.crop_size + first_match_word_position - last_match_word_position;
         // if first token is a word, then remove 1 to remaining_words.
         if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
             remaining_words -= 1;
         }
+
+        // we start from the positions of the matches, then we expand the window on both sides.
         let mut first_token_position = first_match_token_position;
         let mut last_token_position = last_match_token_position;
-
         while remaining_words > 0 {
             match (
+                // try to expand left
                 first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
+                // try to expand right
                 last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
             ) {
+                // we can expand both sides.
                 (Some(ft), Some(lt)) => {
                     match (ft.is_separator(), lt.is_separator()) {
-                        // if they are both separators and are the same kind then advance both
+                        // if they are both separators and are the same kind then advance both,
+                        // or expand on the side of the soft separator.
                         (Some(f_kind), Some(s_kind)) => {
                             if f_kind == s_kind {
                                 first_token_position -= 1;
@@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
                                 first_token_position -= 1;
                             }
                         }
-                        // left is a word, advance left
+                        // if one of the tokens is a word, we expand on the side of the word.
+                        // left is a word, advance left.
                         (None, Some(_)) => {
                             first_token_position -= 1;
                             remaining_words -= 1;
                         }
-                        // right is a word, advance right
+                        // right is a word, advance right.
                         (Some(_), None) => {
                             last_token_position += 1;
                             remaining_words -= 1;
                         }
-                        // both are words, advance left then right if remaining_word > 0
+                        // both are words, advance left then right if remaining_word > 0.
                         (None, None) => {
                             first_token_position -= 1;
                             remaining_words -= 1;
@@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
         (first_token_position, last_token_position)
     }
 
+    /// Compute the score of a match interval:
+    /// 1) count unique matches
+    /// 2) calculate distance between matches
+    /// 3) count ordered matches
     fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
         let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
         let mut order_score = 0;
@@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
         (uniq_score, distance_score, order_score)
     }
 
+    /// Returns the matches interval where the score computed by match_interval_score is maximal.
     fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
+        // we compute the matches interval if we have at least 2 matches.
         if matches.len() > 1 {
+            // positions of the first and the last match of the best matches interval in `matches`.
             let mut best_interval = (0, 0);
             let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
+            // current interval positions.
             let mut interval_first = 0;
             let mut interval_last = 0;
             for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size
+                // if next match would make interval grow more than crop_size,
+                // we compare the current interval with the best one,
+                // then we increase `interval_first` until next match can be added.
                 if next_match.word_position - matches[interval_first].word_position
                     >= self.crop_size
                 {
@@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
                         best_interval_score = interval_score;
                     }
 
-                    // advance start of the interval while interval is longer than crop_size
+                    // advance start of the interval while interval is longer than crop_size.
                     while next_match.word_position - matches[interval_first].word_position
                         >= self.crop_size
                     {
@@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
                 interval_last = index;
             }
 
+            // compute the last interval score and compare it to the best one.
             let interval_score =
                 self.match_interval_score(&matches[interval_first..=interval_last]);
             if interval_score > best_interval_score {
@@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> {
         }
     }
 
+    /// Returns the bounds in byte index of the crop window.
     fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
         let match_interval = self.find_best_match_interval(matches);
 
@@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> {
         (byte_start, byte_end)
     }
 
+    /// Returns the formatted version of the original text.
     pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
         // If 0 it will be considered null and thus not crop the field
         // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
         let crop = crop && self.crop_size > 0;
         if !highlight && !crop {
-            // compute matches is not needed if no highlight or crop is requested.
+            // compute matches is not needed if neither highlight nor crop is requested.
             Cow::Borrowed(self.text)
         } else {
             match &self.matches {
@@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> {
                                 .char_indices()
                                 .enumerate()
                                 .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
-                                .min(token.byte_end);
+                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                             formatted.push(self.highlight_prefix);
                             formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                             formatted.push(self.highlight_suffix);
-                            formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
+                            if highlight_byte_index < token.byte_end {
+                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            }
 
                             byte_index = token.byte_end;
                         }