Change Matcher so that phrases are counted as one instead of word by word

This commit is contained in:
F. Levi 2024-09-12 09:44:37 +03:00
parent 02c2b660f8
commit edcb4c60ba

View File

@ -132,37 +132,21 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
mut partial: PartialMatch<'a>,
token_position: usize,
word_position: usize,
first_word_char_start: &usize,
words_positions: &mut impl Iterator<Item = (usize, usize, &'a Token<'a>)>,
matches: &mut Vec<Match>,
) -> bool {
let mut potential_matches = vec![(token_position, word_position, partial.char_len())];
for (token_position, word_position, word) in words_positions {
for (_, _, word) in words_positions {
partial = match partial.match_token(word) {
// token matches the partial match, but the match is not full,
// we temporarily save the current token then we try to match the next one.
Some(MatchType::Partial(partial)) => {
potential_matches.push((token_position, word_position, partial.char_len()));
partial
}
Some(MatchType::Partial(partial)) => partial,
// partial match is now full, we keep this matches and we advance positions
Some(MatchType::Full { char_len, ids }) => {
let ids: Vec<_> = ids.clone().collect();
// save previously matched tokens as matches.
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| Match {
match_len,
ids: ids.clone(),
word_position,
token_position,
},
);
matches.extend(iter);
Some(MatchType::Full { ids, .. }) => {
// save the token that closes the partial match as a match.
matches.push(Match {
match_len: char_len,
ids,
match_len: word.char_end - first_word_char_start,
ids: ids.clone().collect(),
word_position,
token_position,
});
@ -221,6 +205,7 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
partial,
token_position,
word_position,
&word.char_start,
&mut wp,
&mut matches,
) {
@ -472,15 +457,17 @@ impl<'t, 'tokenizer> Matcher<'t, 'tokenizer, '_, '_> {
.enumerate()
.find(|(i, _)| *i == m.match_len)
.map_or(token.byte_end, |(_, (i, _))| i + token.byte_start);
formatted.push(self.highlight_prefix);
formatted.push(&self.text[token.byte_start..highlight_byte_index]);
formatted.push(self.highlight_suffix);
// if it's a prefix highlight, we put the end of the word after the highlight marker.
if highlight_byte_index < token.byte_end {
formatted.push(&self.text[highlight_byte_index..token.byte_end]);
}
byte_index = token.byte_end;
byte_index = token.byte_start + m.match_len;
}
}
@ -821,22 +808,24 @@ mod tests {
fn format_highlight_crop_phrase_query() {
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
let temp_index = TempIndex::new();
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
temp_index
.add_documents(documents!([
{ "id": 1, "text": "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!" }
{ "id": 1, "text": text }
]))
.unwrap();
let rtxn = temp_index.read_txn().unwrap();
let format_options = FormatOptions { highlight: true, crop: Some(10) };
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "\"the world\"");
let mut matcher = builder.build(text, None);
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
insta::assert_snapshot!(
matcher.format(format_options),
@"…had the power to split <em>the</em> <em>world</em> between those who…"
@"…had the power to split <em>the world</em> between those who…"
);
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "those \"and those\"");
@ -844,7 +833,7 @@ mod tests {
// should highlight "those" and the phrase "and those".
insta::assert_snapshot!(
matcher.format(format_options),
@"…world between <em>those</em> who embraced progress <em>and</em> <em>those</em> who resisted…"
@"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"
);
}
@ -900,7 +889,7 @@ mod tests {
let mut matcher = builder.build(text, None);
insta::assert_snapshot!(
matcher.format(format_options),
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
@"_the_ _do or_ die can't be he do and or isn'_t he_"
);
}
}