mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Make some cleaning and add comments
This commit is contained in:
parent
3bb1e35ada
commit
fa7d3a37c0
@ -4,6 +4,8 @@ pub use matching_words::MatchingWords;
|
|||||||
use matching_words::{MatchType, PrimitiveWordId};
|
use matching_words::{MatchType, PrimitiveWordId};
|
||||||
use meilisearch_tokenizer::token::{SeparatorKind, Token};
|
use meilisearch_tokenizer::token::{SeparatorKind, Token};
|
||||||
|
|
||||||
|
use crate::search::matches::matching_words::PartialMatch;
|
||||||
|
|
||||||
pub mod matching_words;
|
pub mod matching_words;
|
||||||
|
|
||||||
const DEFAULT_CROP_SIZE: usize = 10;
|
const DEFAULT_CROP_SIZE: usize = 10;
|
||||||
@ -106,14 +108,80 @@ pub struct Matcher<'t, 'm> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'t> Matcher<'t, '_> {
|
impl<'t> Matcher<'t, '_> {
|
||||||
|
/// Iterates over tokens and saves any of them that match the query.
|
||||||
fn compute_matches(&mut self) -> &mut Self {
|
fn compute_matches(&mut self) -> &mut Self {
|
||||||
|
fn compute_partial_match(
|
||||||
|
mut partial: PartialMatch,
|
||||||
|
tokens: &[Token],
|
||||||
|
token_position: &mut usize,
|
||||||
|
word_position: &mut usize,
|
||||||
|
matches: &mut Vec<Match>,
|
||||||
|
) -> bool {
|
||||||
|
let mut potential_matches = vec![(*token_position, *word_position, partial.char_len())];
|
||||||
|
let mut t_position = 1;
|
||||||
|
let mut w_position = 1;
|
||||||
|
for token in &tokens[*token_position + 1..] {
|
||||||
|
if token.is_separator().is_none() {
|
||||||
|
partial = match partial.match_token(&token) {
|
||||||
|
// token matches the partial match, but the match is not full,
|
||||||
|
// we temporarily save the current token then we try to match the next one.
|
||||||
|
Some(MatchType::Partial(partial)) => {
|
||||||
|
potential_matches.push((
|
||||||
|
*token_position + t_position,
|
||||||
|
*word_position + w_position,
|
||||||
|
partial.char_len(),
|
||||||
|
));
|
||||||
|
partial
|
||||||
|
}
|
||||||
|
// partial match is now full, we keep these matches and we advance positions
|
||||||
|
Some(MatchType::Full { char_len, ids }) => {
|
||||||
|
// save previously matched tokens as matches.
|
||||||
|
let iter = potential_matches.into_iter().map(
|
||||||
|
|(token_position, word_position, match_len)| Match {
|
||||||
|
match_len,
|
||||||
|
ids: ids.to_vec(),
|
||||||
|
word_position,
|
||||||
|
token_position,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
matches.extend(iter);
|
||||||
|
|
||||||
|
// move word and token positions after the end of the match.
|
||||||
|
*word_position += w_position;
|
||||||
|
*token_position += t_position;
|
||||||
|
|
||||||
|
// save the token that closes the partial match as a match.
|
||||||
|
matches.push(Match {
|
||||||
|
match_len: char_len,
|
||||||
|
ids: ids.to_vec(),
|
||||||
|
word_position: *word_position,
|
||||||
|
token_position: *token_position,
|
||||||
|
});
|
||||||
|
|
||||||
|
// the match is complete, we return true.
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
// no match, continue to next match.
|
||||||
|
None => break,
|
||||||
|
};
|
||||||
|
w_position += 1;
|
||||||
|
}
|
||||||
|
t_position += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
// the match is not complete, we return false.
|
||||||
|
false
|
||||||
|
}
|
||||||
|
|
||||||
let mut matches = Vec::new();
|
let mut matches = Vec::new();
|
||||||
let mut word_position = 0;
|
let mut word_position = 0;
|
||||||
let mut token_position = 0;
|
let mut token_position = 0;
|
||||||
while let Some(token) = self.tokens.get(token_position) {
|
while let Some(token) = self.tokens.get(token_position) {
|
||||||
if token.is_separator().is_none() {
|
if token.is_separator().is_none() {
|
||||||
'matches: for match_type in self.matching_words.match_token(&token) {
|
for match_type in self.matching_words.match_token(&token) {
|
||||||
match match_type {
|
match match_type {
|
||||||
|
// we match, we save the current token as a match,
|
||||||
|
// then we continue the rest of the tokens.
|
||||||
MatchType::Full { char_len, ids } => {
|
MatchType::Full { char_len, ids } => {
|
||||||
matches.push(Match {
|
matches.push(Match {
|
||||||
match_len: char_len,
|
match_len: char_len,
|
||||||
@ -121,58 +189,20 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
word_position,
|
word_position,
|
||||||
token_position,
|
token_position,
|
||||||
});
|
});
|
||||||
// stop on the first match
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
MatchType::Partial(mut partial) => {
|
// we match partially, iterate over next tokens to check if we can complete the match.
|
||||||
let mut potential_matches =
|
MatchType::Partial(partial) => {
|
||||||
vec![(token_position, word_position, partial.char_len())];
|
// if match is completed, we break the matching loop over the current token,
|
||||||
let mut t_position = 1;
|
// then we continue the rest of the tokens.
|
||||||
let mut w_position = 1;
|
if compute_partial_match(
|
||||||
'partials: for token in &self.tokens[token_position + 1..] {
|
partial,
|
||||||
if token.is_separator().is_none() {
|
&self.tokens,
|
||||||
partial = match partial.match_token(&token) {
|
&mut token_position,
|
||||||
Some(MatchType::Partial(partial)) => {
|
&mut word_position,
|
||||||
potential_matches.push((
|
&mut matches,
|
||||||
token_position + t_position,
|
) {
|
||||||
word_position + w_position,
|
break;
|
||||||
partial.char_len(),
|
|
||||||
));
|
|
||||||
partial
|
|
||||||
}
|
|
||||||
// partial match is now full, we keep this matches and we advance positions
|
|
||||||
Some(MatchType::Full { char_len, ids }) => {
|
|
||||||
let iter = potential_matches.into_iter().map(
|
|
||||||
|(token_position, word_position, match_len)| {
|
|
||||||
Match {
|
|
||||||
match_len,
|
|
||||||
ids: ids.to_vec(),
|
|
||||||
word_position,
|
|
||||||
token_position,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
matches.extend(iter);
|
|
||||||
|
|
||||||
word_position += w_position;
|
|
||||||
token_position += t_position;
|
|
||||||
|
|
||||||
matches.push(Match {
|
|
||||||
match_len: char_len,
|
|
||||||
ids: ids.to_vec(),
|
|
||||||
word_position,
|
|
||||||
token_position,
|
|
||||||
});
|
|
||||||
|
|
||||||
break 'matches;
|
|
||||||
}
|
|
||||||
// no match, continue to next match.
|
|
||||||
None => break 'partials,
|
|
||||||
};
|
|
||||||
w_position += 1;
|
|
||||||
}
|
|
||||||
t_position += 1;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -186,6 +216,7 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns boundaries of the words that match the query.
|
||||||
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
pub fn matches(&mut self) -> Vec<MatchBounds> {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
None => self.compute_matches().matches(),
|
None => self.compute_matches().matches(),
|
||||||
@ -199,30 +230,37 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns token position of the window to crop around.
|
||||||
fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
|
fn token_crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
|
||||||
|
// if there is no match, we start from the beginning of the string by default.
|
||||||
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
|
let first_match_word_position = matches.first().map(|m| m.word_position).unwrap_or(0);
|
||||||
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
|
let first_match_token_position = matches.first().map(|m| m.token_position).unwrap_or(0);
|
||||||
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
|
let last_match_word_position = matches.last().map(|m| m.word_position).unwrap_or(0);
|
||||||
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
|
let last_match_token_position = matches.last().map(|m| m.token_position).unwrap_or(0);
|
||||||
|
|
||||||
// TODO: buggy if no match and first token is a sepparator
|
// matches needs to be counted in the crop len.
|
||||||
let mut remaining_words =
|
let mut remaining_words =
|
||||||
self.crop_size + first_match_word_position - last_match_word_position;
|
self.crop_size + first_match_word_position - last_match_word_position;
|
||||||
// if first token is a word, then remove 1 to remaining_words.
|
// if first token is a word, then remove 1 to remaining_words.
|
||||||
if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
|
if let Some(None) = self.tokens.get(first_match_token_position).map(|t| t.is_separator()) {
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// we start from matches positions, then we expand the window in both sides.
|
||||||
let mut first_token_position = first_match_token_position;
|
let mut first_token_position = first_match_token_position;
|
||||||
let mut last_token_position = last_match_token_position;
|
let mut last_token_position = last_match_token_position;
|
||||||
|
|
||||||
while remaining_words > 0 {
|
while remaining_words > 0 {
|
||||||
match (
|
match (
|
||||||
|
// try to expand left
|
||||||
first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
|
first_token_position.checked_sub(1).and_then(|i| self.tokens.get(i)),
|
||||||
|
// try to expand right
|
||||||
last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
|
last_token_position.checked_add(1).and_then(|i| self.tokens.get(i)),
|
||||||
) {
|
) {
|
||||||
|
// we can expand both sides.
|
||||||
(Some(ft), Some(lt)) => {
|
(Some(ft), Some(lt)) => {
|
||||||
match (ft.is_separator(), lt.is_separator()) {
|
match (ft.is_separator(), lt.is_separator()) {
|
||||||
// if they are both separators and are the same kind then advance both
|
// if they are both separators and are the same kind then advance both,
|
||||||
|
// or expand on the soft separator side.
|
||||||
(Some(f_kind), Some(s_kind)) => {
|
(Some(f_kind), Some(s_kind)) => {
|
||||||
if f_kind == s_kind {
|
if f_kind == s_kind {
|
||||||
first_token_position -= 1;
|
first_token_position -= 1;
|
||||||
@ -233,17 +271,18 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
first_token_position -= 1;
|
first_token_position -= 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// left is a word, advance left
|
// if one of the tokens is a word, we expand on the side of the word.
|
||||||
|
// left is a word, advance left.
|
||||||
(None, Some(_)) => {
|
(None, Some(_)) => {
|
||||||
first_token_position -= 1;
|
first_token_position -= 1;
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
// right is a word, advance right
|
// right is a word, advance right.
|
||||||
(Some(_), None) => {
|
(Some(_), None) => {
|
||||||
last_token_position += 1;
|
last_token_position += 1;
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
}
|
}
|
||||||
// both are words, advance left then right if remaining_word > 0
|
// both are words, advance left then right if remaining_word > 0.
|
||||||
(None, None) => {
|
(None, None) => {
|
||||||
first_token_position -= 1;
|
first_token_position -= 1;
|
||||||
remaining_words -= 1;
|
remaining_words -= 1;
|
||||||
@ -277,6 +316,10 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
(first_token_position, last_token_position)
|
(first_token_position, last_token_position)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Compute the score of a match interval:
|
||||||
|
/// 1) count unique matches
|
||||||
|
/// 2) calculate distance between matches
|
||||||
|
/// 3) count ordered matches
|
||||||
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
|
||||||
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
|
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
|
||||||
let mut order_score = 0;
|
let mut order_score = 0;
|
||||||
@ -305,14 +348,20 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
(uniq_score, distance_score, order_score)
|
(uniq_score, distance_score, order_score)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the matches interval where the score computed by match_interval_score is maximal.
|
||||||
fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
|
fn find_best_match_interval<'a>(&self, matches: &'a [Match]) -> &'a [Match] {
|
||||||
|
// we compute the matches interval if we have at least 2 matches.
|
||||||
if matches.len() > 1 {
|
if matches.len() > 1 {
|
||||||
|
// positions of the first and the last match of the best matches interval in `matches`.
|
||||||
let mut best_interval = (0, 0);
|
let mut best_interval = (0, 0);
|
||||||
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
|
let mut best_interval_score = self.match_interval_score(&matches[0..=0]);
|
||||||
|
// current interval positions.
|
||||||
let mut interval_first = 0;
|
let mut interval_first = 0;
|
||||||
let mut interval_last = 0;
|
let mut interval_last = 0;
|
||||||
for (index, next_match) in matches.iter().enumerate().skip(1) {
|
for (index, next_match) in matches.iter().enumerate().skip(1) {
|
||||||
// if next match would make interval gross more than crop_size
|
// if next match would make interval grow larger than crop_size,
|
||||||
|
// we compare the current interval with the best one,
|
||||||
|
// then we increase `interval_first` until next match can be added.
|
||||||
if next_match.word_position - matches[interval_first].word_position
|
if next_match.word_position - matches[interval_first].word_position
|
||||||
>= self.crop_size
|
>= self.crop_size
|
||||||
{
|
{
|
||||||
@ -325,7 +374,7 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
best_interval_score = interval_score;
|
best_interval_score = interval_score;
|
||||||
}
|
}
|
||||||
|
|
||||||
// advance start of the interval while interval is longer than crop_size
|
// advance start of the interval while interval is longer than crop_size.
|
||||||
while next_match.word_position - matches[interval_first].word_position
|
while next_match.word_position - matches[interval_first].word_position
|
||||||
>= self.crop_size
|
>= self.crop_size
|
||||||
{
|
{
|
||||||
@ -335,6 +384,7 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
interval_last = index;
|
interval_last = index;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute the last interval score and compare it to the best one.
|
||||||
let interval_score =
|
let interval_score =
|
||||||
self.match_interval_score(&matches[interval_first..=interval_last]);
|
self.match_interval_score(&matches[interval_first..=interval_last]);
|
||||||
if interval_score > best_interval_score {
|
if interval_score > best_interval_score {
|
||||||
@ -347,6 +397,7 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the bounds in byte index of the crop window.
|
||||||
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
|
fn crop_bounds(&self, matches: &[Match]) -> (usize, usize) {
|
||||||
let match_interval = self.find_best_match_interval(matches);
|
let match_interval = self.find_best_match_interval(matches);
|
||||||
|
|
||||||
@ -357,12 +408,13 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
(byte_start, byte_end)
|
(byte_start, byte_end)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the formatted version of the original text.
|
||||||
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
|
pub fn format(&mut self, highlight: bool, crop: bool) -> Cow<'t, str> {
|
||||||
// If 0 it will be considered null and thus not crop the field
|
// If 0 it will be considered null and thus not crop the field
|
||||||
// https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
// https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
||||||
let crop = crop && self.crop_size > 0;
|
let crop = crop && self.crop_size > 0;
|
||||||
if !highlight && !crop {
|
if !highlight && !crop {
|
||||||
// compute matches is not needed if no highlight or crop is requested.
|
// computing matches is not needed if neither highlight nor crop is requested.
|
||||||
Cow::Borrowed(self.text)
|
Cow::Borrowed(self.text)
|
||||||
} else {
|
} else {
|
||||||
match &self.matches {
|
match &self.matches {
|
||||||
@ -397,12 +449,14 @@ impl<'t> Matcher<'t, '_> {
|
|||||||
.char_indices()
|
.char_indices()
|
||||||
.enumerate()
|
.enumerate()
|
||||||
.find(|(i, _)| *i == m.match_len)
|
.find(|(i, _)| *i == m.match_len)
|
||||||
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start)
|
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
|
||||||
.min(token.byte_end);
|
|
||||||
formatted.push(self.highlight_prefix);
|
formatted.push(self.highlight_prefix);
|
||||||
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
|
||||||
formatted.push(self.highlight_suffix);
|
formatted.push(self.highlight_suffix);
|
||||||
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
// if it's a prefix highlight, we put the end of the word after the highlight marker.
|
||||||
|
if highlight_byte_index < token.byte_end {
|
||||||
|
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
|
||||||
|
}
|
||||||
|
|
||||||
byte_index = token.byte_end;
|
byte_index = token.byte_end;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user