mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-04 20:37:15 +02:00
Refactor, fix, misc
This commit is contained in:
parent
24f213c343
commit
561b4836d8
9 changed files with 689 additions and 569 deletions
BIN
crates/milli/.tmp4e121b/data.mdb
Normal file
BIN
crates/milli/.tmp4e121b/data.mdb
Normal file
Binary file not shown.
BIN
crates/milli/.tmp4e121b/lock.mdb
Normal file
BIN
crates/milli/.tmp4e121b/lock.mdb
Normal file
Binary file not shown.
BIN
crates/milli/.tmpNxMsye/data.mdb
Normal file
BIN
crates/milli/.tmpNxMsye/data.mdb
Normal file
Binary file not shown.
BIN
crates/milli/.tmpNxMsye/lock.mdb
Normal file
BIN
crates/milli/.tmpNxMsye/lock.mdb
Normal file
Binary file not shown.
|
@ -1,152 +1,153 @@
|
|||
use std::cmp::Ordering;
|
||||
|
||||
use charabia::{SeparatorKind, Token, TokenKind};
|
||||
use charabia::{SeparatorKind, Token};
|
||||
|
||||
enum SimpleTokenKind {
|
||||
Separator(SeparatorKind),
|
||||
NonSeparator,
|
||||
Done,
|
||||
#[derive(Clone)]
|
||||
enum Direction {
|
||||
Forwards,
|
||||
Backwards,
|
||||
}
|
||||
|
||||
impl SimpleTokenKind {
|
||||
fn new(token: &Token) -> Self {
|
||||
match token.kind {
|
||||
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
|
||||
_ => Self::NonSeparator,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct CropBoundsHelper<'a> {
|
||||
tokens: &'a [Token<'a>],
|
||||
index_backward: usize,
|
||||
backward_token_kind: SimpleTokenKind,
|
||||
index_forward: usize,
|
||||
forward_token_kind: SimpleTokenKind,
|
||||
}
|
||||
|
||||
impl CropBoundsHelper<'_> {
|
||||
fn advance_backward(&mut self) {
|
||||
if matches!(self.backward_token_kind, SimpleTokenKind::Done) {
|
||||
return;
|
||||
}
|
||||
|
||||
if self.index_backward != 0 {
|
||||
self.index_backward -= 1;
|
||||
self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]);
|
||||
} else {
|
||||
self.backward_token_kind = SimpleTokenKind::Done;
|
||||
}
|
||||
}
|
||||
|
||||
fn advance_forward(&mut self) {
|
||||
if matches!(self.forward_token_kind, SimpleTokenKind::Done) {
|
||||
return;
|
||||
}
|
||||
|
||||
if self.index_forward != self.tokens.len() - 1 {
|
||||
self.index_forward += 1;
|
||||
self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]);
|
||||
} else {
|
||||
self.forward_token_kind = SimpleTokenKind::Done;
|
||||
impl Direction {
|
||||
fn switch(&mut self) {
|
||||
*self = match self {
|
||||
Direction::Backwards => Direction::Forwards,
|
||||
Direction::Forwards => Direction::Backwards,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn get_adjusted_indices_for_too_few_words(
|
||||
tokens: &[Token],
|
||||
index_backward: usize,
|
||||
index_forward: usize,
|
||||
mut index_backward: usize,
|
||||
mut index_forward: usize,
|
||||
mut words_count: usize,
|
||||
crop_size: usize,
|
||||
) -> [usize; 2] {
|
||||
let crop_size = crop_size + 2;
|
||||
let mut cbh = CropBoundsHelper {
|
||||
tokens,
|
||||
index_backward,
|
||||
backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]),
|
||||
index_forward,
|
||||
forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]),
|
||||
};
|
||||
let mut valid_index_backward = index_backward;
|
||||
let mut valid_index_forward = index_forward;
|
||||
|
||||
let mut is_end_reached = index_forward == tokens.len() - 1;
|
||||
let mut is_beginning_reached = index_backward == 0;
|
||||
|
||||
let mut is_index_backwards_at_hard_separator = false;
|
||||
let mut is_index_forwards_at_hard_separator = false;
|
||||
|
||||
// false + ends reached because TODO
|
||||
let mut is_crop_size_or_both_ends_reached = is_end_reached && is_beginning_reached;
|
||||
|
||||
let mut dir = Direction::Forwards;
|
||||
|
||||
loop {
|
||||
match [&cbh.backward_token_kind, &cbh.forward_token_kind] {
|
||||
// if they are both separators and are the same kind then advance both,
|
||||
// or expand in the soft separator side
|
||||
[SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => {
|
||||
if backward_sk == forward_sk {
|
||||
cbh.advance_backward();
|
||||
if is_crop_size_or_both_ends_reached {
|
||||
break;
|
||||
}
|
||||
|
||||
// this avoids having an ending separator before crop marker
|
||||
if words_count < crop_size - 1 {
|
||||
cbh.advance_forward();
|
||||
let (index, valid_index) = match dir {
|
||||
Direction::Backwards => (&mut index_backward, &mut valid_index_backward),
|
||||
Direction::Forwards => (&mut index_forward, &mut valid_index_forward),
|
||||
};
|
||||
|
||||
loop {
|
||||
match dir {
|
||||
Direction::Forwards => {
|
||||
if is_end_reached {
|
||||
break;
|
||||
}
|
||||
} else if matches!(backward_sk, SeparatorKind::Hard) {
|
||||
cbh.advance_forward();
|
||||
} else {
|
||||
cbh.advance_backward();
|
||||
|
||||
*index += 1;
|
||||
|
||||
is_end_reached = *index == tokens.len() - 1;
|
||||
}
|
||||
Direction::Backwards => {
|
||||
if is_beginning_reached
|
||||
|| (!is_end_reached
|
||||
&& is_index_backwards_at_hard_separator
|
||||
&& !is_index_forwards_at_hard_separator)
|
||||
{
|
||||
break;
|
||||
}
|
||||
|
||||
*index -= 1;
|
||||
|
||||
is_beginning_reached = *index == 0;
|
||||
}
|
||||
};
|
||||
|
||||
if is_end_reached && is_beginning_reached {
|
||||
is_crop_size_or_both_ends_reached = true;
|
||||
}
|
||||
// both are words, advance left then right if we haven't reached `crop_size`
|
||||
[SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => {
|
||||
cbh.advance_backward();
|
||||
|
||||
let maybe_is_token_hard_separator = tokens[*index]
|
||||
.separator_kind()
|
||||
.map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard));
|
||||
|
||||
// it's not a separator
|
||||
if maybe_is_token_hard_separator.is_none() {
|
||||
*valid_index = *index;
|
||||
words_count += 1;
|
||||
|
||||
if words_count != crop_size {
|
||||
cbh.advance_forward();
|
||||
words_count += 1;
|
||||
if words_count == crop_size {
|
||||
is_crop_size_or_both_ends_reached = true;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
[SimpleTokenKind::Done, SimpleTokenKind::Done] => break,
|
||||
// if one of the tokens is non-separator and the other a separator, we expand in the non-separator side
|
||||
// if one of the sides reached the end, we expand in the opposite direction
|
||||
[backward_stk, SimpleTokenKind::Done]
|
||||
| [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => {
|
||||
if matches!(backward_stk, SimpleTokenKind::NonSeparator) {
|
||||
words_count += 1;
|
||||
}
|
||||
cbh.advance_backward();
|
||||
}
|
||||
[SimpleTokenKind::Done, forward_stk]
|
||||
| [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => {
|
||||
if matches!(forward_stk, SimpleTokenKind::NonSeparator) {
|
||||
words_count += 1;
|
||||
}
|
||||
cbh.advance_forward();
|
||||
}
|
||||
|
||||
let is_index_at_hard_separator = match dir {
|
||||
Direction::Backwards => &mut is_index_backwards_at_hard_separator,
|
||||
Direction::Forwards => &mut is_index_forwards_at_hard_separator,
|
||||
};
|
||||
*is_index_at_hard_separator =
|
||||
maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard);
|
||||
}
|
||||
|
||||
dir.switch();
|
||||
|
||||
// 1. if end is reached, we can only advance backwards
|
||||
// 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards
|
||||
}
|
||||
|
||||
// keep advancing forward to check if there's only separator tokens left until the end
|
||||
// if so, then include those too in the index range
|
||||
let mut try_index_forward = valid_index_forward + 1;
|
||||
while let Some(token) = tokens.get(try_index_forward) {
|
||||
if !token.is_separator() {
|
||||
return [valid_index_backward, valid_index_forward];
|
||||
}
|
||||
|
||||
try_index_forward += 1;
|
||||
}
|
||||
|
||||
[valid_index_backward, try_index_forward - 1]
|
||||
}
|
||||
|
||||
fn get_adjusted_index_forward_for_too_many_words(
|
||||
tokens: &[Token],
|
||||
index_backward: usize,
|
||||
mut index_forward: usize,
|
||||
mut words_count: usize,
|
||||
crop_size: usize,
|
||||
) -> usize {
|
||||
loop {
|
||||
if index_forward == index_backward {
|
||||
return index_forward;
|
||||
}
|
||||
|
||||
index_forward -= 1;
|
||||
|
||||
if tokens[index_forward].is_separator() {
|
||||
continue;
|
||||
}
|
||||
|
||||
words_count -= 1;
|
||||
|
||||
if words_count == crop_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
[cbh.index_backward, cbh.index_forward]
|
||||
}
|
||||
|
||||
fn get_adjusted_index_forward_for_too_many_words(
|
||||
tokens: &[Token],
|
||||
mut index_forward: usize,
|
||||
mut words_count: usize,
|
||||
crop_size: usize,
|
||||
) -> usize {
|
||||
while index_forward != 0 {
|
||||
if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) {
|
||||
words_count -= 1;
|
||||
|
||||
if words_count == crop_size {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
index_forward -= 1;
|
||||
}
|
||||
|
||||
if index_forward == 0 {
|
||||
return index_forward;
|
||||
}
|
||||
|
||||
index_forward - 1
|
||||
index_forward
|
||||
}
|
||||
|
||||
pub fn get_adjusted_indices_for_highlights_and_crop_size(
|
||||
|
@ -164,14 +165,12 @@ pub fn get_adjusted_indices_for_highlights_and_crop_size(
|
|||
words_count,
|
||||
crop_size,
|
||||
),
|
||||
Ordering::Equal => [
|
||||
if index_backward != 0 { index_backward - 1 } else { index_backward },
|
||||
if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward },
|
||||
],
|
||||
Ordering::Equal => [index_backward, index_forward],
|
||||
Ordering::Greater => [
|
||||
index_backward,
|
||||
get_adjusted_index_forward_for_too_many_words(
|
||||
tokens,
|
||||
index_backward,
|
||||
index_forward,
|
||||
words_count,
|
||||
crop_size,
|
||||
|
@ -185,7 +184,7 @@ pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usi
|
|||
let mut index = 0;
|
||||
|
||||
while index != tokens.len() - 1 {
|
||||
if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) {
|
||||
if !tokens[index].is_separator() {
|
||||
words_count += 1;
|
||||
|
||||
if words_count == crop_size {
|
||||
|
|
|
@ -14,6 +14,7 @@ use utoipa::ToSchema;
|
|||
|
||||
use super::FormatOptions;
|
||||
|
||||
// TODO: Differentiate if full match do not return None, instead return match bounds with full length
|
||||
#[derive(Serialize, ToSchema)]
|
||||
#[serde(rename_all = "camelCase")]
|
||||
pub struct MatchBounds {
|
||||
|
@ -158,20 +159,28 @@ impl MatchBoundsHelper<'_> {
|
|||
}
|
||||
|
||||
/// For crop but no highlight.
|
||||
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option<MatchBounds> {
|
||||
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds {
|
||||
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
|
||||
let final_token = &self.tokens[final_token_index];
|
||||
|
||||
if final_token_index == self.tokens.len() - 1 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
|
||||
|
||||
Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] })
|
||||
// TODO: Can here be an error, because it's byte_start but it could be byte_end?
|
||||
MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] }
|
||||
}
|
||||
|
||||
fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
|
||||
let asd = |i1, i2| {
|
||||
println!(
|
||||
"{}|{}|{}\n{} {}",
|
||||
self.tokens[..i1].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
|
||||
self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
|
||||
self.tokens[i2..].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
|
||||
i1,
|
||||
i2
|
||||
);
|
||||
};
|
||||
|
||||
// TODO: This doesn't give back 2 phrases if one is out of crop window
|
||||
// Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them
|
||||
let [matches_first_index, matches_last_index] =
|
||||
|
@ -196,28 +205,17 @@ impl MatchBoundsHelper<'_> {
|
|||
crop_size,
|
||||
);
|
||||
|
||||
let is_index_backward_at_limit = index_backward == 0;
|
||||
let is_index_forward_at_limit = index_forward == self.tokens.len() - 1;
|
||||
asd(first_match.get_first_token_pos(), last_match.get_last_token_pos());
|
||||
asd(index_backward, index_forward);
|
||||
|
||||
let backward_token = &self.tokens[index_backward];
|
||||
let crop_byte_start = if is_index_backward_at_limit {
|
||||
backward_token.byte_start
|
||||
} else {
|
||||
backward_token.byte_end
|
||||
};
|
||||
|
||||
let forward_token = &self.tokens[index_forward];
|
||||
let crop_byte_end = if is_index_forward_at_limit {
|
||||
forward_token.byte_end
|
||||
} else {
|
||||
forward_token.byte_start
|
||||
};
|
||||
|
||||
MatchesAndCropIndices {
|
||||
matches_first_index,
|
||||
matches_last_index,
|
||||
crop_byte_start,
|
||||
crop_byte_end,
|
||||
crop_byte_start: backward_token.byte_start,
|
||||
crop_byte_end: forward_token.byte_end,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -248,7 +246,7 @@ impl MatchBounds {
|
|||
|
||||
if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
|
||||
if matches.is_empty() {
|
||||
return mbh.get_crop_bounds_with_no_matches(crop_size);
|
||||
return Some(mbh.get_crop_bounds_with_no_matches(crop_size));
|
||||
}
|
||||
|
||||
if format_options.highlight {
|
||||
|
@ -258,15 +256,15 @@ impl MatchBounds {
|
|||
return Some(mbh.get_crop_bounds_with_matches(crop_size));
|
||||
}
|
||||
|
||||
if format_options.highlight && !matches.is_empty() {
|
||||
Some(mbh.get_match_bounds(MatchesAndCropIndices {
|
||||
matches_first_index: 0,
|
||||
matches_last_index: matches.len() - 1,
|
||||
crop_byte_start: 0,
|
||||
crop_byte_end: tokens[tokens.len() - 1].byte_end,
|
||||
}))
|
||||
} else {
|
||||
None
|
||||
if !format_options.highlight || matches.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
Some(mbh.get_match_bounds(MatchesAndCropIndices {
|
||||
matches_first_index: 0,
|
||||
matches_last_index: matches.len() - 1,
|
||||
crop_byte_start: 0,
|
||||
crop_byte_end: tokens[tokens.len() - 1].byte_end,
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
|
|
@ -115,17 +115,21 @@ impl MatchingWords {
|
|||
|
||||
let position = [*positions.start(), *positions.end()];
|
||||
|
||||
located_matching_phrases.reserve(matching_phrases.len());
|
||||
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
|
||||
LocatedMatchingPhrase { value: *matching_phrase, position }
|
||||
}));
|
||||
if !matching_phrases.is_empty() {
|
||||
located_matching_phrases.reserve(matching_phrases.len());
|
||||
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
|
||||
LocatedMatchingPhrase { value: *matching_phrase, position }
|
||||
}));
|
||||
}
|
||||
|
||||
located_matching_words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
position,
|
||||
is_prefix: term.is_prefix(),
|
||||
original_char_count: term.original_word(&ctx).chars().count(),
|
||||
});
|
||||
if !matching_words.is_empty() {
|
||||
located_matching_words.push(LocatedMatchingWords {
|
||||
value: matching_words,
|
||||
position,
|
||||
is_prefix: term.is_prefix(),
|
||||
original_char_count: term.original_word(&ctx).chars().count(),
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// Sort words by having `is_prefix` as false first and then by their lengths in reverse order.
|
||||
|
@ -147,12 +151,11 @@ impl MatchingWords {
|
|||
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
|
||||
) -> Option<(Match, UserQueryPositionRange)> {
|
||||
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
|
||||
let words_iter = self
|
||||
.phrase_interner
|
||||
.get(lmp.value)
|
||||
.words
|
||||
let words = &self.phrase_interner.get(lmp.value).words;
|
||||
|
||||
let words_iter = words
|
||||
.iter()
|
||||
.map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str()))
|
||||
.map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str()))
|
||||
.peekable();
|
||||
|
||||
(lmp.position, words_iter)
|
||||
|
@ -161,7 +164,7 @@ impl MatchingWords {
|
|||
'outer: loop {
|
||||
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;
|
||||
|
||||
// TODO: Is it worth only cloning if we have to?
|
||||
// TODO: if it's worth it, clone only if we have to
|
||||
let mut tph_iter = token_position_helper_iter.clone();
|
||||
|
||||
let mut first_tph_details = None;
|
||||
|
@ -241,46 +244,50 @@ impl MatchingWords {
|
|||
tph: TokenPositionHelper,
|
||||
text: &str,
|
||||
) -> Option<(Match, UserQueryPositionRange)> {
|
||||
let mut iter =
|
||||
self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w)));
|
||||
// TODO: There is potentially an optimization to be made here
|
||||
// if we matched a term then we can skip checking it for further iterations?
|
||||
|
||||
loop {
|
||||
let (located_words, word) = iter.next()?;
|
||||
let word = self.word_interner.get(*word);
|
||||
self.located_matching_words
|
||||
.iter()
|
||||
.flat_map(|lw| lw.value.iter().map(move |w| (lw, w)))
|
||||
.find_map(|(located_words, word)| {
|
||||
let word = self.word_interner.get(*word);
|
||||
|
||||
let [char_count, byte_len] =
|
||||
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) {
|
||||
PrefixedOrEquality::Prefixed => {
|
||||
let prefix_byte_len = text[tph.token.byte_start..]
|
||||
.char_indices()
|
||||
.nth(located_words.original_char_count - 1)
|
||||
.map(|(i, c)| i + c.len_utf8())
|
||||
.expect("expected text to have n-th thing bal bla TODO");
|
||||
let [char_count, byte_len] =
|
||||
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix)
|
||||
{
|
||||
PrefixedOrEquality::Prefixed => {
|
||||
let prefix_byte_len = text[tph.token.byte_start..]
|
||||
.char_indices()
|
||||
.nth(located_words.original_char_count - 1)
|
||||
.map(|(i, c)| i + c.len_utf8())
|
||||
.expect("expected text to have n-th thing bal bla TODO");
|
||||
|
||||
// TODO: Investigate token original byte length and similar methods and why they're not good enough
|
||||
// TODO: Investigate token original byte length and similar methods and why they're not good enough
|
||||
// That might be because token original byte length only or could also refer to the normalized byte length
|
||||
|
||||
[located_words.original_char_count, prefix_byte_len]
|
||||
}
|
||||
// do not +1, because Token index ranges are exclusive
|
||||
PrefixedOrEquality::Equality => [
|
||||
tph.token.char_end - tph.token.char_start,
|
||||
tph.token.byte_end - tph.token.byte_start,
|
||||
],
|
||||
_ => continue,
|
||||
};
|
||||
[located_words.original_char_count, prefix_byte_len]
|
||||
}
|
||||
// do not +1, because Token index ranges are exclusive
|
||||
PrefixedOrEquality::Equality => [
|
||||
tph.token.char_end - tph.token.char_start,
|
||||
tph.token.byte_end - tph.token.byte_start,
|
||||
],
|
||||
_ => return None,
|
||||
};
|
||||
|
||||
return Some((
|
||||
Match {
|
||||
char_count,
|
||||
byte_len,
|
||||
position: MatchPosition::Word {
|
||||
word_position: tph.position_by_word,
|
||||
token_position: tph.position_by_token,
|
||||
Some((
|
||||
Match {
|
||||
char_count,
|
||||
byte_len,
|
||||
position: MatchPosition::Word {
|
||||
word_position: tph.position_by_word,
|
||||
token_position: tph.position_by_token,
|
||||
},
|
||||
},
|
||||
},
|
||||
located_words.position,
|
||||
));
|
||||
}
|
||||
located_words.position,
|
||||
))
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get_matches_and_query_positions(
|
||||
|
@ -361,93 +368,93 @@ impl Debug for MatchingWords {
|
|||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub(crate) mod tests {
|
||||
use super::super::super::located_query_terms_from_tokens;
|
||||
use super::*;
|
||||
use crate::search::new::matches::tests::temp_index_with_documents;
|
||||
use crate::search::new::query_term::ExtractedTokens;
|
||||
use charabia::{TokenKind, TokenizerBuilder};
|
||||
use std::borrow::Cow;
|
||||
// #[cfg(test)]
|
||||
// pub(crate) mod tests {
|
||||
// use super::super::super::located_query_terms_from_tokens;
|
||||
// use super::*;
|
||||
// use crate::search::new::matches::tests::temp_index_with_documents;
|
||||
// use crate::search::new::query_term::ExtractedTokens;
|
||||
// use charabia::{TokenKind, TokenizerBuilder};
|
||||
// use std::borrow::Cow;
|
||||
|
||||
#[test]
|
||||
fn matching_words() {
|
||||
let temp_index = temp_index_with_documents(None);
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
|
||||
let mut builder = TokenizerBuilder::default();
|
||||
let tokenizer = builder.build();
|
||||
let text = "split this world";
|
||||
let tokens = tokenizer.tokenize(text);
|
||||
let ExtractedTokens { query_terms, .. } =
|
||||
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
let matching_words = MatchingWords::new(ctx, &query_terms);
|
||||
// #[test]
|
||||
// fn matching_words() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
|
||||
// let mut builder = TokenizerBuilder::default();
|
||||
// let tokenizer = builder.build();
|
||||
// let text = "split this world";
|
||||
// let tokens = tokenizer.tokenize(text);
|
||||
// let ExtractedTokens { query_terms, .. } =
|
||||
// located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
// let matching_words = MatchingWords::new(ctx, &query_terms);
|
||||
|
||||
assert_eq!(
|
||||
matching_words.get_matches_and_query_positions(
|
||||
&[
|
||||
Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("split"),
|
||||
char_end: "split".chars().count(),
|
||||
byte_end: "split".len(),
|
||||
..Default::default()
|
||||
},
|
||||
Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("nyc"),
|
||||
char_end: "nyc".chars().count(),
|
||||
byte_end: "nyc".len(),
|
||||
..Default::default()
|
||||
},
|
||||
Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("world"),
|
||||
char_end: "world".chars().count(),
|
||||
byte_end: "world".len(),
|
||||
..Default::default()
|
||||
},
|
||||
Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("worlded"),
|
||||
char_end: "worlded".chars().count(),
|
||||
byte_end: "worlded".len(),
|
||||
..Default::default()
|
||||
},
|
||||
Token {
|
||||
kind: TokenKind::Word,
|
||||
lemma: Cow::Borrowed("thisnew"),
|
||||
char_end: "thisnew".chars().count(),
|
||||
byte_end: "thisnew".len(),
|
||||
..Default::default()
|
||||
}
|
||||
],
|
||||
text
|
||||
),
|
||||
(
|
||||
vec![
|
||||
Match {
|
||||
char_count: 5,
|
||||
byte_len: 5,
|
||||
position: MatchPosition::Word { word_position: 0, token_position: 0 }
|
||||
},
|
||||
Match {
|
||||
char_count: 5,
|
||||
byte_len: 5,
|
||||
position: MatchPosition::Word { word_position: 2, token_position: 2 }
|
||||
},
|
||||
Match {
|
||||
char_count: 5,
|
||||
byte_len: 5,
|
||||
position: MatchPosition::Word { word_position: 3, token_position: 3 }
|
||||
}
|
||||
],
|
||||
vec![
|
||||
QueryPosition { range: [0, 0], index: 0 },
|
||||
QueryPosition { range: [2, 2], index: 1 },
|
||||
QueryPosition { range: [2, 2], index: 2 }
|
||||
]
|
||||
)
|
||||
);
|
||||
}
|
||||
}
|
||||
// assert_eq!(
|
||||
// matching_words.get_matches_and_query_positions(
|
||||
// &[
|
||||
// Token {
|
||||
// kind: TokenKind::Word,
|
||||
// lemma: Cow::Borrowed("split"),
|
||||
// char_end: "split".chars().count(),
|
||||
// byte_end: "split".len(),
|
||||
// ..Default::default()
|
||||
// },
|
||||
// Token {
|
||||
// kind: TokenKind::Word,
|
||||
// lemma: Cow::Borrowed("nyc"),
|
||||
// char_end: "nyc".chars().count(),
|
||||
// byte_end: "nyc".len(),
|
||||
// ..Default::default()
|
||||
// },
|
||||
// Token {
|
||||
// kind: TokenKind::Word,
|
||||
// lemma: Cow::Borrowed("world"),
|
||||
// char_end: "world".chars().count(),
|
||||
// byte_end: "world".len(),
|
||||
// ..Default::default()
|
||||
// },
|
||||
// Token {
|
||||
// kind: TokenKind::Word,
|
||||
// lemma: Cow::Borrowed("worlded"),
|
||||
// char_end: "worlded".chars().count(),
|
||||
// byte_end: "worlded".len(),
|
||||
// ..Default::default()
|
||||
// },
|
||||
// Token {
|
||||
// kind: TokenKind::Word,
|
||||
// lemma: Cow::Borrowed("thisnew"),
|
||||
// char_end: "thisnew".chars().count(),
|
||||
// byte_end: "thisnew".len(),
|
||||
// ..Default::default()
|
||||
// }
|
||||
// ],
|
||||
// text
|
||||
// ),
|
||||
// (
|
||||
// vec![
|
||||
// Match {
|
||||
// char_count: 5,
|
||||
// byte_len: 5,
|
||||
// position: MatchPosition::Word { word_position: 0, token_position: 0 }
|
||||
// },
|
||||
// Match {
|
||||
// char_count: 5,
|
||||
// byte_len: 5,
|
||||
// position: MatchPosition::Word { word_position: 2, token_position: 2 }
|
||||
// },
|
||||
// Match {
|
||||
// char_count: 5,
|
||||
// byte_len: 5,
|
||||
// position: MatchPosition::Word { word_position: 3, token_position: 3 }
|
||||
// }
|
||||
// ],
|
||||
// vec![
|
||||
// QueryPosition { range: [0, 0], index: 0 },
|
||||
// QueryPosition { range: [2, 2], index: 1 },
|
||||
// QueryPosition { range: [2, 2], index: 2 }
|
||||
// ]
|
||||
// )
|
||||
// );
|
||||
// }
|
||||
// }
|
||||
|
|
|
@ -48,10 +48,9 @@ impl<'a> MatcherBuilder<'a> {
|
|||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Default, Debug)]
|
||||
#[derive(Copy, Clone, Default)]
|
||||
pub struct FormatOptions {
|
||||
pub highlight: bool,
|
||||
// TODO: Should this be usize?
|
||||
pub crop: Option<usize>,
|
||||
}
|
||||
|
||||
|
@ -80,7 +79,9 @@ impl Matcher<'_, '_, '_, '_> {
|
|||
/// TODO: description
|
||||
pub fn get_match_bounds(
|
||||
&mut self,
|
||||
// TODO: Add option to count grapheme clusters instead of bytes
|
||||
// TODO: Add option to count UTF-16 segments, or whatever JS works with when slicing strings
|
||||
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#utf-16_characters_unicode_code_points_and_grapheme_clusters
|
||||
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice
|
||||
format_options: Option<FormatOptions>,
|
||||
) -> Option<MatchBounds> {
|
||||
if self.text.is_empty() {
|
||||
|
@ -152,7 +153,6 @@ mod tests {
|
|||
use crate::index::tests::TempIndex;
|
||||
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
|
||||
use charabia::TokenizerBuilder;
|
||||
use memmap2::Mmap;
|
||||
|
||||
impl<'a> MatcherBuilder<'a> {
|
||||
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
|
||||
|
@ -180,10 +180,9 @@ mod tests {
|
|||
.unwrap();
|
||||
|
||||
// consume context and located_query_terms to build MatchingWords.
|
||||
let matching_words = match located_query_terms {
|
||||
Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
|
||||
None => MatchingWords::default(),
|
||||
};
|
||||
let matching_words = located_query_terms
|
||||
.map(|located_query_terms| MatchingWords::new(ctx, &located_query_terms))
|
||||
.unwrap_or_default();
|
||||
|
||||
MatcherBuilder::new(
|
||||
matching_words,
|
||||
|
@ -197,283 +196,401 @@ mod tests {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn temp_index_with_documents(documents: Option<Mmap>) -> TempIndex {
|
||||
pub fn rename_me(
|
||||
format_options: Option<FormatOptions>,
|
||||
text: &str,
|
||||
query: &str,
|
||||
expected_text: &str,
|
||||
) {
|
||||
let temp_index = TempIndex::new();
|
||||
|
||||
// document will always contain the same exact text normally
|
||||
// TODO: Describe this better and ask if this is actually the case
|
||||
temp_index
|
||||
.add_documents(documents.unwrap_or_else(|| {
|
||||
documents!([
|
||||
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
|
||||
{ "id": 2, "name": "Westfália" },
|
||||
{ "id": 3, "name": "Ŵôřlḑôle" },
|
||||
])
|
||||
}))
|
||||
.add_documents(documents!([
|
||||
{ "id": 1, "text": text.to_string() },
|
||||
]))
|
||||
.unwrap();
|
||||
|
||||
temp_index
|
||||
}
|
||||
|
||||
fn get_expected_maybe_text(expected_text: &str, text: &str) -> Option<String> {
|
||||
if expected_text == text {
|
||||
None
|
||||
} else {
|
||||
Some(expected_text.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_identity() {
|
||||
let temp_index = temp_index_with_documents(None);
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
let format_options = Some(FormatOptions { highlight: false, crop: None });
|
||||
|
||||
let test_values = [
|
||||
// Text without any match.
|
||||
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
// Text containing all matches.
|
||||
"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
// Text containing some matches.
|
||||
"Natalie risk her future to build a world with the boy she loves."
|
||||
];
|
||||
|
||||
for text in test_values {
|
||||
let mut matcher = builder.build(text, None);
|
||||
// no crop and no highlight should return complete text.
|
||||
assert_eq!(matcher.get_formatted_text(format_options), None);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn format_highlight() {
|
||||
let temp_index = temp_index_with_documents(None);
|
||||
let rtxn = temp_index.read_txn().unwrap();
|
||||
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
let format_options = Some(FormatOptions { highlight: true, crop: None });
|
||||
|
||||
let test_values = [
|
||||
// empty text.
|
||||
["", ""],
|
||||
// text containing only separators.
|
||||
[":-)", ":-)"],
|
||||
// Text without any match.
|
||||
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
|
||||
// Text containing all matches.
|
||||
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
|
||||
// Text containing some matches.
|
||||
["Natalie risk her future to build a world with the boy she loves.",
|
||||
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
|
||||
];
|
||||
|
||||
for [text, expected_text] in test_values {
|
||||
let mut matcher = builder.build(text, None);
|
||||
// no crop should return complete text with highlighted matches.
|
||||
assert_eq!(
|
||||
matcher.get_formatted_text(format_options),
|
||||
get_expected_maybe_text(expected_text, text)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn highlight_unicode() {
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let format_options = Some(FormatOptions { highlight: true, crop: None });

    // (query, text, expected highlighted text)
    let cases = [
        // Prefix match on a unicode word: only the matched prefix is wrapped.
        ("world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"),
        // Whole-word unicode match.
        ("world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"),
        // Unicode match where the query is a prefix of the word.
        ("westfali", "Westfália", "<em>Westfáli</em>a"),
    ];

    for (query, text, expected_text) in cases {
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
        let mut matcher = builder.build(text, None);
        // Without crop, the whole text comes back with matches highlighted.
        assert_eq!(
            matcher.get_formatted_text(format_options),
            get_expected_maybe_text(expected_text, text)
        );
    }
}
|
||||
|
||||
#[test]
fn format_crop() {
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
    let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });

    // (text, expected cropped text) — crop window is 10 words, no highlighting.
    let cases = [
        // Empty text stays empty.
        ("", ""),
        // Text made only of separators is returned untouched.
        (":-)", ":-)"),
        // No match: crop keeps the beginning of the text.
        ("A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
         "A quick brown fox can not jump 32 feet, right…"),
        // No match, text starting with a separator.
        ("(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
         "(A quick brown fox can not jump 32 feet, right…" ),
        // Phrase propagation: the crop window centers on the phrase.
        ("Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
         "…Split The World is a book written by Emily Henry…"),
        // Some of the query words match.
        ("Natalie risk her future to build a world with the boy she loves.",
         "…future to build a world with the boy she loves."),
        // All query words match.
        ("Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
         "…she loves. Emily Henry: The Love That Split The World."),
        // An unordered match and an ordered match: the ordered one wins.
        ("The world split void void void void void void void void void split the world void void",
         "…void void void void void split the world void void"),
        // Matches with different densities: the densest window wins.
        ("split void the void void world void void void void void void void void void void split the world void void",
         "…void void void void void split the world void void"),
        ("split split split split split split void void void void void void void void void void split the world void void",
         "…void void void void void split the world void void"),
    ];

    for (text, expected_text) in cases {
        let mut matcher = builder.build(text, None);
        assert_eq!(
            matcher.get_formatted_text(format_options),
            get_expected_maybe_text(expected_text, text)
        );
    }
}
|
||||
|
||||
#[test]
fn format_highlight_crop() {
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
    let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });

    // (text, expected text) — 10-word crop window AND highlighting together.
    let cases = [
        // Empty text stays empty.
        ("", ""),
        // Separator-only text is returned untouched.
        (":-)", ":-)"),
        // No match: crop keeps the beginning, nothing highlighted.
        ("A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
         "A quick brown fox can not jump 32 feet, right…"),
        // Partial match: crop around the matches, matches highlighted.
        ("Natalie risk her future to build a world with the boy she loves.",
         "…future to build a <em>world</em> with <em>the</em> boy she loves."),
        // All query words match.
        ("Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
         "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."),
        // Unordered and ordered matches: crop centers on the ordered one.
        ("The world split void void void void void void void void void split the world void void",
         "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"),
    ];

    for (text, expected_text) in cases {
        let mut matcher = builder.build(text, None);
        assert_eq!(
            matcher.get_formatted_text(format_options),
            get_expected_maybe_text(expected_text, text)
        );
    }
}
|
||||
|
||||
#[test]
fn format_highlight_crop_phrase_query() {
    //! testing: https://github.com/meilisearch/meilisearch/issues/3975
    let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
    let temp_index = temp_index_with_documents(Some(documents!([
        { "id": 1, "text": text }
    ])));
    let rtxn = temp_index.read_txn().unwrap();

    let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });

    // (query, expected text) — phrase queries, 10-word crop with highlighting.
    let cases = [
        // 10 words with markers at both ends; the phrase is one highlight span.
        ("\"the world\"",
         "…the power to split <em>the world</em> between those who embraced…"),
        // Highlights the lone "those" and the phrase "and those".
        ("those \"and those\"",
         "…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"),
        ("\"The groundbreaking invention had the power to split the world\"",
         "<em>The groundbreaking invention had the power to split the world</em>…"),
        ("\"The groundbreaking invention had the power to split the world between those\"",
         "<em>The groundbreaking invention had the power to split the world</em>…"),
        ("\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
         "…between those who <em>embraced progress and those who resisted change</em>!"),
        ("\"groundbreaking invention\" \"split the world between\"",
         "…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"),
        ("\"groundbreaking invention\" \"had the power to split the world between those\"",
         "…<em>invention</em> <em>had the power to split the world between those</em>…"),
    ];

    for (query, expected_text) in cases {
        let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
        let mut matcher = builder.build(text, None);

        assert_eq!(
            matcher.get_formatted_text(format_options),
            get_expected_maybe_text(expected_text, text)
        );
    }
}
|
||||
|
||||
#[test]
fn smaller_crop_size() {
    //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
    let text = "void void split the world void void.";
    // NOTE(review): removed a stray duplicate `builder` binding that referenced an
    // undefined `query` variable — it could not compile and the first builder is
    // the one this test is about.
    let mut matcher = builder.build(text, None);

    let test_values = [
        // crop size < query size: matches are only partially formatted.
        (2, "…split the…"),
        (1, "…split…"),
        // crop size 0: cropping is disabled, the full text is returned.
        (0, "void void split the world void void."),
    ];

    for (crop_size, expected_text) in test_values {
        let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
        assert_eq!(
            matcher.get_formatted_text(format_options),
            get_expected_maybe_text(expected_text, text)
        );
    }
    // NOTE(review): removed a trailing assert that used `format_options` and
    // `expected_text` outside the loop where they are declared (out of scope,
    // compile error); the loop above already checks every case.
}
|
||||
|
||||
#[test]
fn partial_matches() {
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");

    let format_options = Some(FormatOptions { highlight: true, crop: None });

    let text = "the do or die can't be he do and or isn't he";
    let mut matcher = builder.build(text, None);
    // NOTE(review): the original was truncated mid-`assert_eq!` — the closing
    // `);` and the function's closing brace were missing, which cannot parse.
    // Restored the call exactly as written and closed it.
    assert_eq!(
        matcher.get_formatted_text(format_options),
        Some(
            "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
                .to_string()
        )
    );
}
|
||||
/// "Dei store fiskane eta dei små — dei liger under som minst förmå."
|
||||
///
|
||||
/// (Men are like fish; the great ones devour the small.)
|
||||
fn rename_me_with_base_text(
|
||||
format_options: Option<FormatOptions>,
|
||||
query: &str,
|
||||
expected_text: &str,
|
||||
) {
|
||||
rename_me(
|
||||
format_options,
|
||||
"Dei store fiskane eta dei små — dei liger under som minst förmå.",
|
||||
query,
|
||||
expected_text,
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn phrase_highlight_bigger_than_crop() {
    // A phrase match wider than the crop window: only its first word survives.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(1) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei</em>…");
}
|
||||
|
||||
#[test]
fn phrase_highlight_same_size_as_crop() {
    // The phrase match exactly fills the crop window.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(2) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei liger</em>…");
}
|
||||
|
||||
#[test]
fn phrase_highlight_crop_middle() {
    // Phrase match in the middle of the text: crop expands on both sides.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…små — <em>dei liger</em> under…");
}
|
||||
|
||||
#[test]
fn phrase_highlight_crop_end() {
    // Phrase match at the end of the text: crop expands leftwards only.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"minst förmå\"", "…under som <em>minst förmå</em>.");
}
|
||||
|
||||
#[test]
fn phrase_highlight_crop_beginning() {
    // Phrase match at the start of the text: crop expands rightwards only.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"Dei store\"", "<em>Dei store</em> fiskane eta…");
}
|
||||
|
||||
#[test]
fn highlight_end() {
    // No crop: whole text returned, matches at the very end highlighted individually.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "minst förmå",
        "Dei store fiskane eta dei små — dei liger under som <em>minst</em> <em>förmå</em>.",
    );
}
|
||||
|
||||
#[test]
fn highlight_beginning_and_middle() {
    // No crop: matches at the start and every later occurrence of "dei" highlighted.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "Dei store",
        "<em>Dei</em> <em>store</em> fiskane eta <em>dei</em> små — <em>dei</em> liger under som minst förmå.",
    );
}
|
||||
|
||||
#[test]
fn partial_match_middle() {
    // TODO: Is this intentional?
    // Here the only interned word is "forma", hence it cannot find the searched prefix
    // word "fo" inside "forma" within milli::search::new::matches::matching_words::MatchingWords::try_get_word_match
    // `milli::search::new::query_term::QueryTerm::all_computed_derivations` might be at fault here

    let format_options = Some(FormatOptions { highlight: true, crop: None });

    // interned words = ["forma"]
    rename_me(
        format_options,
        "altså, förmå, på en måte",
        "fo",
        "altså, <em>förmå</em>, på en måte",
    );

    // interned words = ["fo", "forma"]
    rename_me(
        format_options,
        "altså, fo förmå, på en måte",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
|
||||
|
||||
#[test]
fn partial_match_end() {
    let format_options = Some(FormatOptions { highlight: true, crop: None });

    // Prefix query "fo" against a word at the end of the text.
    rename_me(format_options, "förmå, på en måte", "fo", "<em>förmå</em>, på en måte");

    // Same, with the literal word "fo" also present (interning both derivations).
    rename_me(
        format_options,
        "fo förmå, på en måte",
        "fo",
        "<em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
|
||||
|
||||
#[test]
fn partial_match_beginning() {
    let format_options = Some(FormatOptions { highlight: true, crop: None });

    // Prefix query "fo" against a word at the start of the match region.
    rename_me(format_options, "altså, förmå", "fo", "altså, <em>förmå</em>");

    // Same, with the literal word "fo" also present (interning both derivations).
    rename_me(
        format_options,
        "altså, fo förmå",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå",
    );
}
|
||||
|
||||
// #[test]
|
||||
// fn format_identity() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
// let format_options = Some(FormatOptions { highlight: false, crop: None });
|
||||
|
||||
// let test_values = [
|
||||
// // Text without any match.
|
||||
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
// // Text containing all matches.
|
||||
// "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
// // Text containing some matches.
|
||||
// "Natalie risk her future to build a world with the boy she loves."
|
||||
// ];
|
||||
|
||||
// for text in test_values {
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// // no crop and no highlight should return complete text.
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), None);
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn format_highlight() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
// let format_options = Some(FormatOptions { highlight: true, crop: None });
|
||||
|
||||
// let test_values = [
|
||||
// // empty text.
|
||||
// ["", ""],
|
||||
// // text containing only separators.
|
||||
// [":-)", ":-)"],
|
||||
// // Text without any match.
|
||||
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
|
||||
// // Text containing all matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
|
||||
// // Text containing some matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves.",
|
||||
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
|
||||
// ];
|
||||
|
||||
// for [text, expected_text] in test_values {
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// // no crop should return complete text with highlighted matches.
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn highlight_unicode() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let format_options = Some(FormatOptions { highlight: true, crop: None });
|
||||
|
||||
// let test_values = [
|
||||
// // Text containing prefix match.
|
||||
// ["world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"],
|
||||
// // Text containing unicode match.
|
||||
// ["world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"],
|
||||
// // Text containing unicode match.
|
||||
// ["westfali", "Westfália", "<em>Westfáli</em>a"],
|
||||
// ];
|
||||
|
||||
// for [query, text, expected_text] in test_values {
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// // no crop should return complete text with highlighted matches.
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn format_crop() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
// let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });
|
||||
|
||||
// let test_values = [
|
||||
// // empty text.
|
||||
// // ["", ""],
|
||||
// // text containing only separators.
|
||||
// // [":-)", ":-)"],
|
||||
// // Text without any match.
|
||||
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
// "A quick brown fox can not jump 32 feet, right…"],
|
||||
// // Text without any match starting by a separator.
|
||||
// ["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
|
||||
// "(A quick brown fox can not jump 32 feet, right…" ],
|
||||
// // Test phrase propagation
|
||||
// ["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
|
||||
// "…Split The World is a book written by Emily Henry…"],
|
||||
// // Text containing some matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves.",
|
||||
// "…future to build a world with the boy she loves."],
|
||||
// // Text containing all matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
// "…she loves. Emily Henry: The Love That Split The World."],
|
||||
// // Text containing a match unordered and a match ordered.
|
||||
// ["The world split void void void void void void void void void split the world void void",
|
||||
// "…void void void void void split the world void void"],
|
||||
// // Text containing matches with different density.
|
||||
// ["split void the void void world void void void void void void void void void void split the world void void",
|
||||
// "…void void void void void split the world void void"],
|
||||
// ["split split split split split split void void void void void void void void void void split the world void void",
|
||||
// "…void void void void void split the world void void"]
|
||||
// ];
|
||||
|
||||
// for [text, expected_text] in test_values {
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// // no crop should return complete text with highlighted matches.
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn format_highlight_crop() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
|
||||
|
||||
// let test_values = [
|
||||
// // empty text.
|
||||
// ["", ""],
|
||||
// // text containing only separators.
|
||||
// [":-)", ":-)"],
|
||||
// // Text without any match.
|
||||
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
|
||||
// "A quick brown fox can not jump 32 feet, right…"],
|
||||
// // Text containing some matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves.",
|
||||
// "…future to build a <em>world</em> with <em>the</em> boy she loves."],
|
||||
// // Text containing all matches.
|
||||
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
|
||||
// "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
|
||||
// // Text containing a match unordered and a match ordered.
|
||||
// ["The world split void void void void void void void void void split the world void void",
|
||||
// "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"]
|
||||
// ];
|
||||
|
||||
// for [text, expected_text] in test_values {
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// // no crop should return complete text with highlighted matches.
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn format_highlight_crop_phrase_query() {
|
||||
// //! testing: https://github.com/meilisearch/meilisearch/issues/3975
|
||||
// let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
|
||||
// let temp_index = temp_index_with_documents(Some(documents!([
|
||||
// { "id": 1, "text": text }
|
||||
// ])));
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
|
||||
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
|
||||
|
||||
// let test_values = [
|
||||
// // should return 10 words with a marker at the start as well the end, and the highlighted matches.
|
||||
// ["\"the world\"",
|
||||
// "…the power to split <em>the world</em> between those who embraced…"],
|
||||
// // should highlight "those" and the phrase "and those".
|
||||
// ["those \"and those\"",
|
||||
// "…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"],
|
||||
// ["\"The groundbreaking invention had the power to split the world\"",
|
||||
// "<em>The groundbreaking invention had the power to split the world</em>…"],
|
||||
// ["\"The groundbreaking invention had the power to split the world between those\"",
|
||||
// "<em>The groundbreaking invention had the power to split the world</em>…"],
|
||||
// ["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
|
||||
// "…between those who <em>embraced progress and those who resisted change</em>!"],
|
||||
// ["\"groundbreaking invention\" \"split the world between\"",
|
||||
// "…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"],
|
||||
// ["\"groundbreaking invention\" \"had the power to split the world between those\"",
|
||||
// "…<em>invention</em> <em>had the power to split the world between those</em>…"],
|
||||
// ];
|
||||
|
||||
// for [query, expected_text] in test_values {
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
|
||||
// let mut matcher = builder.build(text, None);
|
||||
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn smaller_crop_size() {
|
||||
// //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
|
||||
// let text = "void void split the world void void.";
|
||||
// let mut matcher = builder.build(text, None);
|
||||
|
||||
// let test_values = [
|
||||
// // set a smaller crop size
|
||||
// // because crop size < query size, partially format matches.
|
||||
// (2, "…split the…"),
|
||||
// // set a smaller crop size
|
||||
// // because crop size < query size, partially format matches.
|
||||
// (1, "…split…"),
|
||||
// // set crop size to 0
|
||||
// // because crop size is 0, crop is ignored.
|
||||
// (0, "void void split the world void void."),
|
||||
// ];
|
||||
|
||||
// for (crop_size, expected_text) in test_values {
|
||||
// // set a smaller crop size
|
||||
// let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
|
||||
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
|
||||
// }
|
||||
// }
|
||||
|
||||
// #[test]
|
||||
// fn partial_matches() {
|
||||
// let temp_index = temp_index_with_documents(None);
|
||||
// let rtxn = temp_index.read_txn().unwrap();
|
||||
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
|
||||
|
||||
// let format_options = Some(FormatOptions { highlight: true, crop: None });
|
||||
|
||||
// let text = "the do or die can't be he do and or isn't he";
|
||||
// let mut matcher = builder.build(text, None);
|
||||
// assert_eq!(
|
||||
// matcher.get_formatted_text(format_options),
|
||||
// Some(
|
||||
// "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
|
||||
// .to_string()
|
||||
// )
|
||||
// );
|
||||
// }
|
||||
}
|
||||
|
|
|
@ -489,8 +489,7 @@ impl QueryTerm {
|
|||
let mut words = BTreeSet::new();
|
||||
let mut phrases = BTreeSet::new();
|
||||
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
|
||||
&self.zero_typo;
|
||||
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo;
|
||||
words.extend(zero_typo.iter().copied());
|
||||
words.extend(prefix_of.iter().copied());
|
||||
phrases.extend(phrase.iter().copied());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue