Make some cleaning and add comments

2025-07-01 19:08:29 +02:00 · 2022-04-05 17:35:52 +02:00 · 2022-04-05 17:35:52 +02:00 · fa7d3a37c0
commit fa7d3a37c0
parent 3bb1e35ada
1 changed files with 117 additions and 63 deletions
--- a/milli/src/search/matches/mod.rs
+++ b/milli/src/search/matches/mod.rs
@ -4,6 +4,8 @@ pub use matching_words::MatchingWords;
 use matching_words::{MatchType, PrimitiveWordId};
 use meilisearch_tokenizer::token::{SeparatorKind, Token};
 use crate::search::matches::matching_words::PartialMatch;
 pub mod matching_words;
 const DEFAULT_CROP_SIZE: usize = 10;
@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> {
 }
 impl<'t> Matcher<'t, '_> {
    /// Iterates over tokens and save any of them that matches the query.
    fn compute_matches(&mut self) -> &mut Self {
        /// Tries to complete `partial` with the tokens that follow `*token_position`.
        ///
        /// Separator tokens are skipped; every other token is fed to the partial match.
        /// When the partial match becomes full, one `Match` is pushed into `matches` for
        /// each token that took part in it, `*token_position` and `*word_position` are moved
        /// onto the token that closed the match, and `true` is returned.
        /// When a word token fails to match, or the tokens run out, nothing is written to
        /// `matches` nor to the positions and `false` is returned.
        fn compute_partial_match(
            mut partial: PartialMatch,
            tokens: &[Token],
            token_position: &mut usize,
            word_position: &mut usize,
            matches: &mut Vec<Match>,
        ) -> bool {
            // (token position, word position, matched char length) of each token that matched
            // so far; they are only turned into matches once the partial match is completed.
            let mut pending = vec![(*token_position, *word_position, partial.char_len())];
            // offset, in words, of the token under inspection relative to the starting token.
            let mut word_offset = 1;

            for (index, token) in tokens[*token_position + 1..].iter().enumerate() {
                // offset, in tokens, of the token under inspection relative to the starting token.
                let token_offset = index + 1;

                // a separator moves the token offset forward but never the word offset.
                if token.is_separator().is_some() {
                    continue;
                }

                partial = match partial.match_token(token) {
                    // the token extends the partial match without completing it:
                    // remember it and carry on with the next token.
                    Some(MatchType::Partial(extended)) => {
                        pending.push((
                            *token_position + token_offset,
                            *word_position + word_offset,
                            extended.char_len(),
                        ));
                        extended
                    }
                    // the token completes the match.
                    Some(MatchType::Full { char_len, ids }) => {
                        // every token remembered so far becomes a match.
                        for (pending_token, pending_word, match_len) in pending {
                            matches.push(Match {
                                match_len,
                                ids: ids.to_vec(),
                                word_position: pending_word,
                                token_position: pending_token,
                            });
                        }
                        // move both cursors onto the token that closed the match.
                        *token_position += token_offset;
                        *word_position += word_offset;
                        // the closing token is a match as well.
                        matches.push(Match {
                            match_len: char_len,
                            ids: ids.to_vec(),
                            word_position: *word_position,
                            token_position: *token_position,
                        });
                        return true;
                    }
                    // the token does not continue the partial match: give up.
                    None => return false,
                };
                word_offset += 1;
            }

            // tokens ran out before the partial match could be completed.
            false
        }
        let mut matches = Vec::new();
        let mut word_position = 0;
        let mut token_position = 0;
        while let Some(token) = self.tokens.get(token_position) {
            if token.is_separator().is_none() {
-                'matches: for match_type in self.matching_words.match_token(&token) {
+                for match_type in self.matching_words.match_token(&token) {
                    match match_type {
                        // we match, we save the current token as a match,
                        // then we continue the rest of the tokens.
                        MatchType::Full { char_len, ids } => {
                            matches.push(Match {
                                match_len: char_len,
@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> {
                                word_position,
                                token_position,
                            });
                            // stop on the first match
                            break;
                        }
-                        MatchType::Partial(mut partial) => {
+                        // we match partially, iterate over next tokens to check if we can complete the match.
-                            let mut potential_matches =
+                        MatchType::Partial(partial) => {
-                                vec![(token_position, word_position, partial.char_len())];
+                            // if match is completed, we break the matching loop over the current token,
-                            let mut t_position = 1;
+                            // then we continue the rest of the tokens.
-                            let mut w_position = 1;
+                            if compute_partial_match(
-                            'partials: for token in &self.tokens[token_position + 1..] {
+                                partial,
-                                if token.is_separator().is_none() {
+                                &self.tokens,
-                                    partial = match partial.match_token(&token) {
+                                &mut token_position,
-                                        Some(MatchType::Partial(partial)) => {
+                                &mut word_position,
-                                            potential_matches.push((
+                                &mut matches,
-                                                token_position + t_position,
+                            ) {
-                                                word_position + w_position,
+                                break;
                                                partial.char_len(),
                                            ));
                                            partial
                                        }
                                        // partial match is now full, we keep this matches and we advance positions
                                        Some(MatchType::Full { char_len, ids }) => {
                                            let iter = potential_matches.into_iter().map(
                                                |(token_position, word_position, match_len)| {
                                                    Match {
                                                        match_len,
                                                        ids: ids.to_vec(),
                                                        word_position,
                                                        token_position,
                                                    }
                                                },
                                            );
                                            matches.extend(iter);
                                            word_position += w_position;
                                            token_position += t_position;
                                            matches.push(Match {
                                                match_len: char_len,
                                                ids: ids.to_vec(),
                                                word_position,
                                                token_position,
                                            });
                                            break 'matches;
                                        }
                                        // no match, continue to next match.
                                        None => break 'partials,
                                    };
                                    w_position += 1;
                                }
                                t_position += 1;
                            }
                        }
                    }
@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> {
        self
    }
    /// Returns boundaries of the words that match the query.
    pub fn matches(&mut self) -> Vec<MatchBounds> {
        match &self.matches {
            None => self.compute_matches().matches(),
@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> {
        }
    }
    /// Returns token position of the window to crop around.
    fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
        // if there is no match, we start from the beginning of the string by default.
        let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
        let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
        let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
        let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
-        // TODO: buggy if no match and first token is a sepparator
+        // matches needs to be counted in the crop len.
        let mut remaining_words =
            self.crop_size + first_match_word_position - last_match_word_position;
        // if first token is a word, then remove 1 to remaining_words.
        if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
            remaining_words -= 1;
        }
        // we start from matches positions, then we expand the window in both sides.
        let mut first_token_position = first_match_token_position;
        let mut last_token_position = last_match_token_position;
        while remaining_words > 0 {
            match (
                // try to expand left
                first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
                // try to expand right
                last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
            ) {
                // we can expand both sides.
                (Some(ft), Some(lt)) => {
                    match (ft.is_separator(), lt.is_separator()) {
-                        // if they are both separators and are the same kind then advance both
+                        // if they are both separators and are the same kind then advance both,
                        // or expand on the side of the soft separator.
                        (Some(f_kind), Some(s_kind)) => {
                            if f_kind == s_kind {
                                first_token_position -= 1;
@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
                                first_token_position -= 1;
                            }
                        }
-                        // left is a word, advance left
+                        // if one of the tokens is a word, we expand on the side of the word.
                        // left is a word, advance left.
                        (None, Some(_)) => {
                            first_token_position -= 1;
                            remaining_words -= 1;
                        }
-                        // right is a word, advance right
+                        // right is a word, advance right.
                        (Some(_), None) => {
                            last_token_position += 1;
                            remaining_words -= 1;
                        }
-                        // both are words, advance left then right if remaining_word > 0
+                        // both are words, advance left then right if remaining_word > 0.
                        (None, None) => {
                            first_token_position -= 1;
                            remaining_words -= 1;
@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
        (first_token_position, last_token_position)
    }
    /// Compute the score of a match interval:
    /// 1) count unique matches
    /// 2) calculate distance between matches
    /// 3) count ordered matches
    fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
        let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
        let mut order_score = 0;
@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
        (uniq_score, distance_score, order_score)
    }
    /// Returns the matches interval where the score computed by match_interval_score is maximal.
    fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
        // we compute the matches interval if we have at least 2 matches.
        if matches.len() > 1 {
            // positions of the first and the last match of the best matches interval in `matches`.
            let mut best_interval = (0, 0);
            let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
            // current interval positions.
            let mut interval_first = 0;
            let mut interval_last = 0;
            for (index, next_match) in matches.iter().enumerate().skip(1) {
-                // if next match would make interval gross more than crop_size
+                // if next match would make interval grow larger than crop_size,
                // we compare the current interval with the best one,
                // then we increase `interval_first` until next match can be added.
                if next_match.word_position - matches[interval_first].word_position
                    >= self.crop_size
                {
@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
                        best_interval_score = interval_score;
                    }
-                    // advance start of the interval while interval is longer than crop_size
+                    // advance start of the interval while interval is longer than crop_size.
                    while next_match.word_position - matches[interval_first].word_position
                        >= self.crop_size
                    {
@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
                interval_last = index;
            }
            // compute the last interval score and compare it to the best one.
            let interval_score =
                self.match_interval_score(&matches[interval_first..=interval_last]);
            if interval_score > best_interval_score {
@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> {
        }
    }
    /// Returns the bounds in byte index of the crop window.
    fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
        let match_interval = self.find_best_match_interval(matches);
@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> {
        (byte_start, byte_end)
    }
    /// Returns the formatted version of the original text.
    pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
        // If 0 it will be considered null and thus not crop the field
        // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
        let crop = crop && self.crop_size > 0;
        if !highlight && !crop {
-            // compute matches is not needed if no highlight or crop is requested.
+            // compute matches is not needed if no highlight nor crop is requested.
            Cow::Borrowed(self.text)
        } else {
            match &self.matches {
@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> {
                                .char_indices()
                                .enumerate()
                                .find(|(i, _)| *i == m.match_len)
-                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
+                                .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
                                .min(token.byte_end);
                            formatted.push(self.highlight_prefix);
                            formatted.push(&self.text[token.byte_start..highlight_byte_index]);
                            formatted.push(self.highlight_suffix);
-                            formatted.push(&self.text[highlight_byte_index..token.byte_end]);
+                            // if it's a prefix highlight, we put the end of the word after the highlight marker.
                            if highlight_byte_index < token.byte_end {
                                formatted.push(&self.text[highlight_byte_index..token.byte_end]);
                            }
                            byte_index = token.byte_end;
                        }