Make some cleaning and add comments

This commit is contained in:
ManyTheFish 2022-04-05 17:35:52 +02:00
parent 3bb1e35ada
commit fa7d3a37c0

View File

@ -4,6 +4,8 @@ pub use matching_words::MatchingWords;
use matching_words::{MatchType, PrimitiveWordId}; use matching_words::{MatchType, PrimitiveWordId};
use meilisearch_tokenizer::token::{SeparatorKind, Token}; use meilisearch_tokenizer::token::{SeparatorKind, Token};
use crate::search::matches::matching_words::PartialMatch;
pub mod matching_words; pub mod matching_words;
const DEFAULT_CROP_SIZE: usize = 10; const DEFAULT_CROP_SIZE: usize = 10;
@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> {
} }
impl<'t> Matcher<'t, '_> { impl<'t> Matcher<'t, '_> {
/// Iterates over tokens and save any of them that matches the query.
fn compute_matches(&mut self) -> &mut Self { fn compute_matches(&mut self) -> &mut Self {
/// Tries to complete `partial` using the tokens located after `*token_position`.
///
/// Separator tokens are skipped; every other token is handed to the partial match.
/// When the match becomes full, one `Match` is pushed into `matches` for each token
/// that took part in it, `*token_position` and `*word_position` are moved onto the
/// token that closed the match, and `true` is returned.
/// When a token does not match, or when the tokens are exhausted, `false` is returned
/// and `matches`, `*token_position` and `*word_position` are left untouched.
fn compute_partial_match(
    mut partial: PartialMatch,
    tokens: &[Token],
    token_position: &mut usize,
    word_position: &mut usize,
    matches: &mut Vec<Match>,
) -> bool {
    let start_token = *token_position;
    let start_word = *word_position;

    // (token position, word position, matched char length) of every token that is
    // part of the match so far; they only become real matches once the match is full.
    let mut pending = vec![(start_token, start_word, partial.char_len())];
    // number of word (non-separator) tokens met after the starting token.
    let mut words_seen = 0;

    for (offset, token) in tokens[start_token + 1..].iter().enumerate() {
        // separators never take part in a match, but they still occupy a token position.
        if token.is_separator().is_some() {
            continue;
        }

        words_seen += 1;
        let candidate_token = start_token + 1 + offset;
        let candidate_word = start_word + words_seen;

        match partial.match_token(&token) {
            // the token matches but the match is still not full:
            // we temporarily save the current token, then we try the next one.
            Some(MatchType::Partial(next)) => {
                pending.push((candidate_token, candidate_word, next.char_len()));
                partial = next;
            }
            // the partial match is now full.
            Some(MatchType::Full { char_len, ids }) => {
                // save the previously matched tokens as matches.
                matches.extend(pending.into_iter().map(
                    |(token_position, word_position, match_len)| Match {
                        match_len,
                        ids: ids.to_vec(),
                        word_position,
                        token_position,
                    },
                ));
                // move word and token positions onto the token that closes the match.
                *word_position = candidate_word;
                *token_position = candidate_token;
                // save the closing token as a match too.
                matches.push(Match {
                    match_len: char_len,
                    ids: ids.to_vec(),
                    word_position: candidate_word,
                    token_position: candidate_token,
                });
                return true;
            }
            // the token does not continue the match: give up.
            None => break,
        }
    }

    // the match could not be completed.
    false
}
let mut matches = Vec::new(); let mut matches = Vec::new();
let mut word_position = 0; let mut word_position = 0;
let mut token_position = 0; let mut token_position = 0;
while let Some(token) = self.tokens.get(token_position) { while let Some(token) = self.tokens.get(token_position) {
if token.is_separator().is_none() { if token.is_separator().is_none() {
'matches: for match_type in self.matching_words.match_token(&token) { for match_type in self.matching_words.match_token(&token) {
match match_type { match match_type {
// we match, we save the current token as a match,
// then we continue the rest of the tokens.
MatchType::Full { char_len, ids } => { MatchType::Full { char_len, ids } => {
matches.push(Match { matches.push(Match {
match_len: char_len, match_len: char_len,
@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> {
word_position, word_position,
token_position, token_position,
}); });
// stop on the first match
break; break;
} }
MatchType::Partial(mut partial) => { // we match partially, iterate over next tokens to check if we can complete the match.
let mut potential_matches = MatchType::Partial(partial) => {
vec![(token_position, word_position, partial.char_len())]; // if match is completed, we break the matching loop over the current token,
let mut t_position = 1; // then we continue the rest of the tokens.
let mut w_position = 1; if compute_partial_match(
'partials: for token in &self.tokens[token_position + 1..] { partial,
if token.is_separator().is_none() { &self.tokens,
partial = match partial.match_token(&token) { &mut token_position,
Some(MatchType::Partial(partial)) => { &mut word_position,
potential_matches.push(( &mut matches,
token_position + t_position, ) {
word_position + w_position, break;
partial.char_len(),
));
partial
}
// partial match is now full, we keep this matches and we advance positions
Some(MatchType::Full { char_len, ids }) => {
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| {
Match {
match_len,
ids: ids.to_vec(),
word_position,
token_position,
}
},
);
matches.extend(iter);
word_position += w_position;
token_position += t_position;
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
word_position,
token_position,
});
break 'matches;
}
// no match, continue to next match.
None => break 'partials,
};
w_position += 1;
}
t_position += 1;
} }
} }
} }
@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> {
self self
} }
/// Returns boundaries of the words that match the query.
pub fn matches(&mut self) -> Vec<MatchBounds> { pub fn matches(&mut self) -> Vec<MatchBounds> {
match &self.matches { match &self.matches {
None => self.compute_matches().matches(), None => self.compute_matches().matches(),
@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> {
} }
} }
/// Returns token position of the window to crop around.
fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) { fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
// if there is no match, we start from the beginning of the string by default.
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0); let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0); let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0); let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0); let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
// TODO: buggy if no match and first token is a sepparator // matches needs to be counted in the crop len.
let mut remaining_words = let mut remaining_words =
self.crop_size + first_match_word_position - last_match_word_position; self.crop_size + first_match_word_position - last_match_word_position;
// if first token is a word, then remove 1 to remaining_words. // if first token is a word, then remove 1 to remaining_words.
if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) { if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
remaining_words -= 1; remaining_words -= 1;
} }
// we start from matches positions, then we expand the window in both sides.
let mut first_token_position = first_match_token_position; let mut first_token_position = first_match_token_position;
let mut last_token_position = last_match_token_position; let mut last_token_position = last_match_token_position;
while remaining_words > 0 { while remaining_words > 0 {
match ( match (
// try to expand left
first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)), first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
// try to expand right
last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)), last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
) { ) {
// we can expand both sides.
(Some(ft), Some(lt)) => { (Some(ft), Some(lt)) => {
match (ft.is_separator(), lt.is_separator()) { match (ft.is_separator(), lt.is_separator()) {
// if they are both separators and are the same kind then advance both // if they are both separators and are the same kind then advance both,
// or expand on the soft separator side.
(Some(f_kind), Some(s_kind)) => { (Some(f_kind), Some(s_kind)) => {
if f_kind == s_kind { if f_kind == s_kind {
first_token_position -= 1; first_token_position -= 1;
@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
first_token_position -= 1; first_token_position -= 1;
} }
} }
// left is a word, advance left // if one of the tokens is a word, we expand on the side of the word.
// left is a word, advance left.
(None, Some(_)) => { (None, Some(_)) => {
first_token_position -= 1; first_token_position -= 1;
remaining_words -= 1; remaining_words -= 1;
} }
// right is a word, advance right // right is a word, advance right.
(Some(_), None) => { (Some(_), None) => {
last_token_position += 1; last_token_position += 1;
remaining_words -= 1; remaining_words -= 1;
} }
// both are words, advance left then right if remaining_word > 0 // both are words, advance left then right if remaining_word > 0.
(None, None) => { (None, None) => {
first_token_position -= 1; first_token_position -= 1;
remaining_words -= 1; remaining_words -= 1;
@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
(first_token_position, last_token_position) (first_token_position, last_token_position)
} }
/// Compute the score of a match interval:
/// 1) count unique matches
/// 2) calculate distance between matches
/// 3) count ordered matches
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) { fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len()); let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
let mut order_score = 0; let mut order_score = 0;
@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
(uniq_score, distance_score, order_score) (uniq_score, distance_score, order_score)
} }
/// Returns the matches interval where the score computed by match_interval_score is maximal.
fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] { fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
// we compute the matches interval if we have at least 2 matches.
if matches.len() > 1 { if matches.len() > 1 {
// positions of the first and the last match of the best matches interval in `matches`.
let mut best_interval = (0, 0); let mut best_interval = (0, 0);
let mut best_interval_score = self.match_interval_score(&matches[0..=0]); let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
// current interval positions.
let mut interval_first = 0; let mut interval_first = 0;
let mut interval_last = 0; let mut interval_last = 0;
for (index, next_match) in matches.iter().enumerate().skip(1) { for (index, next_match) in matches.iter().enumerate().skip(1) {
// if next match would make interval gross more than crop_size // if next match would make interval gross more than crop_size,
// we compare the current interval with the best one,
// then we increase `interval_first` until next match can be added.
if next_match.word_position - matches[interval_first].word_position if next_match.word_position - matches[interval_first].word_position
>= self.crop_size >= self.crop_size
{ {
@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
best_interval_score = interval_score; best_interval_score = interval_score;
} }
// advance start of the interval while interval is longer than crop_size // advance start of the interval while interval is longer than crop_size.
while next_match.word_position - matches[interval_first].word_position while next_match.word_position - matches[interval_first].word_position
>= self.crop_size >= self.crop_size
{ {
@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
interval_last = index; interval_last = index;
} }
// compute the last interval score and compare it to the best one.
let interval_score = let interval_score =
self.match_interval_score(&matches[interval_first..=interval_last]); self.match_interval_score(&matches[interval_first..=interval_last]);
if interval_score > best_interval_score { if interval_score > best_interval_score {
@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> {
} }
} }
/// Returns the bounds in byte index of the crop window.
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) { fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
let match_interval = self.find_best_match_interval(matches); let match_interval = self.find_best_match_interval(matches);
@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> {
(byte_start, byte_end) (byte_start, byte_end)
} }
// Returns the formatted version of the original text.
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> { pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
// If 0 it will be considered null and thus not crop the field // If 0 it will be considered null and thus not crop the field
// https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 // https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let crop = crop && self.crop_size > 0; let crop = crop && self.crop_size > 0;
if !highlight && !crop { if !highlight && !crop {
// compute matches is not needed if no highlight or crop is requested. // compute matches is not needed if no highlight nor crop is requested.
Cow::Borrowed(self.text) Cow::Borrowed(self.text)
} else { } else {
match &self.matches { match &self.matches {
@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> {
.char_indices() .char_indices()
.enumerate() .enumerate()
.find(|(i, _)| *i == m.match_len) .find(|(i, _)| *i == m.match_len)
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start) .map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
.min(token.byte_end);
formatted.push(self.highlight_prefix); formatted.push(self.highlight_prefix);
formatted.push(&self.text[token.byte_start..highlight_byte_index]); formatted.push(&self.text[token.byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix); formatted.push(self.highlight_suffix);
formatted.push(&self.text[highlight_byte_index..token.byte_end]); // if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < token.byte_end {
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
}
byte_index = token.byte_end; byte_index = token.byte_end;
} }