Refactor, fix, misc

This commit is contained in:
F. Levi 2025-06-15 14:18:18 +03:00
parent 24f213c343
commit 561b4836d8
9 changed files with 689 additions and 569 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,152 +1,153 @@
use std::cmp::Ordering;
use charabia::{SeparatorKind, Token, TokenKind};
use charabia::{SeparatorKind, Token};
enum SimpleTokenKind {
Separator(SeparatorKind),
NonSeparator,
Done,
#[derive(Clone)]
enum Direction {
Forwards,
Backwards,
}
impl SimpleTokenKind {
fn new(token: &Token) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NonSeparator,
}
}
}
struct CropBoundsHelper<'a> {
tokens: &'a [Token<'a>],
index_backward: usize,
backward_token_kind: SimpleTokenKind,
index_forward: usize,
forward_token_kind: SimpleTokenKind,
}
impl CropBoundsHelper<'_> {
fn advance_backward(&mut self) {
if matches!(self.backward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_backward != 0 {
self.index_backward -= 1;
self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]);
} else {
self.backward_token_kind = SimpleTokenKind::Done;
}
}
fn advance_forward(&mut self) {
if matches!(self.forward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_forward != self.tokens.len() - 1 {
self.index_forward += 1;
self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]);
} else {
self.forward_token_kind = SimpleTokenKind::Done;
impl Direction {
fn switch(&mut self) {
*self = match self {
Direction::Backwards => Direction::Forwards,
Direction::Forwards => Direction::Backwards,
}
}
}
fn get_adjusted_indices_for_too_few_words(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
mut index_backward: usize,
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> [usize; 2] {
let crop_size = crop_size + 2;
let mut cbh = CropBoundsHelper {
tokens,
index_backward,
backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]),
index_forward,
forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]),
};
let mut valid_index_backward = index_backward;
let mut valid_index_forward = index_forward;
let mut is_end_reached = index_forward == tokens.len() - 1;
let mut is_beginning_reached = index_backward == 0;
let mut is_index_backwards_at_hard_separator = false;
let mut is_index_forwards_at_hard_separator = false;
// false + ends reached because TODO
let mut is_crop_size_or_both_ends_reached = is_end_reached && is_beginning_reached;
let mut dir = Direction::Forwards;
loop {
match [&cbh.backward_token_kind, &cbh.forward_token_kind] {
// if they are both separators and are the same kind then advance both,
// or expand in the soft separator side
[SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => {
if backward_sk == forward_sk {
cbh.advance_backward();
if is_crop_size_or_both_ends_reached {
break;
}
// this avoids having an ending separator before crop marker
if words_count < crop_size - 1 {
cbh.advance_forward();
let (index, valid_index) = match dir {
Direction::Backwards => (&mut index_backward, &mut valid_index_backward),
Direction::Forwards => (&mut index_forward, &mut valid_index_forward),
};
loop {
match dir {
Direction::Forwards => {
if is_end_reached {
break;
}
} else if matches!(backward_sk, SeparatorKind::Hard) {
cbh.advance_forward();
} else {
cbh.advance_backward();
*index += 1;
is_end_reached = *index == tokens.len() - 1;
}
Direction::Backwards => {
if is_beginning_reached
|| (!is_end_reached
&& is_index_backwards_at_hard_separator
&& !is_index_forwards_at_hard_separator)
{
break;
}
*index -= 1;
is_beginning_reached = *index == 0;
}
};
if is_end_reached && is_beginning_reached {
is_crop_size_or_both_ends_reached = true;
}
// both are words, advance left then right if we haven't reached `crop_size`
[SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => {
cbh.advance_backward();
let maybe_is_token_hard_separator = tokens[*index]
.separator_kind()
.map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard));
// it's not a separator
if maybe_is_token_hard_separator.is_none() {
*valid_index = *index;
words_count += 1;
if words_count != crop_size {
cbh.advance_forward();
words_count += 1;
if words_count == crop_size {
is_crop_size_or_both_ends_reached = true;
}
break;
}
[SimpleTokenKind::Done, SimpleTokenKind::Done] => break,
// if one of the tokens is non-separator and the other a separator, we expand in the non-separator side
// if one of the sides reached the end, we expand in the opposite direction
[backward_stk, SimpleTokenKind::Done]
| [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => {
if matches!(backward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_backward();
}
[SimpleTokenKind::Done, forward_stk]
| [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => {
if matches!(forward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_forward();
}
let is_index_at_hard_separator = match dir {
Direction::Backwards => &mut is_index_backwards_at_hard_separator,
Direction::Forwards => &mut is_index_forwards_at_hard_separator,
};
*is_index_at_hard_separator =
maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard);
}
dir.switch();
// 1. if end is reached, we can only advance backwards
// 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards
}
// keep advancing forward to check if there's only separator tokens left until the end
// if so, then include those too in the index range
let mut try_index_forward = valid_index_forward + 1;
while let Some(token) = tokens.get(try_index_forward) {
if !token.is_separator() {
return [valid_index_backward, valid_index_forward];
}
try_index_forward += 1;
}
[valid_index_backward, try_index_forward - 1]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
index_backward: usize,
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
loop {
if index_forward == index_backward {
return index_forward;
}
index_forward -= 1;
if tokens[index_forward].is_separator() {
continue;
}
words_count -= 1;
if words_count == crop_size {
break;
}
}
[cbh.index_backward, cbh.index_forward]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
while index_forward != 0 {
if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) {
words_count -= 1;
if words_count == crop_size {
break;
}
}
index_forward -= 1;
}
if index_forward == 0 {
return index_forward;
}
index_forward - 1
index_forward
}
pub fn get_adjusted_indices_for_highlights_and_crop_size(
@ -164,14 +165,12 @@ pub fn get_adjusted_indices_for_highlights_and_crop_size(
words_count,
crop_size,
),
Ordering::Equal => [
if index_backward != 0 { index_backward - 1 } else { index_backward },
if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward },
],
Ordering::Equal => [index_backward, index_forward],
Ordering::Greater => [
index_backward,
get_adjusted_index_forward_for_too_many_words(
tokens,
index_backward,
index_forward,
words_count,
crop_size,
@ -185,7 +184,7 @@ pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usi
let mut index = 0;
while index != tokens.len() - 1 {
if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) {
if !tokens[index].is_separator() {
words_count += 1;
if words_count == crop_size {

View file

@ -14,6 +14,7 @@ use utoipa::ToSchema;
use super::FormatOptions;
// TODO: Differentiate if full match do not return None, instead return match bounds with full length
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchBounds {
@ -158,20 +159,28 @@ impl MatchBoundsHelper<'_> {
}
/// For crop but no highlight.
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option<MatchBounds> {
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds {
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
let final_token = &self.tokens[final_token_index];
if final_token_index == self.tokens.len() - 1 {
return None;
}
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] })
// TODO: Can here be an error, because it's byte_start but it could be byte_end?
MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] }
}
fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
let asd = |i1, i2| {
println!(
"{}|{}|{}\n{} {}",
self.tokens[..i1].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i2..].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
i1,
i2
);
};
// TODO: This doesn't give back 2 phrases if one is out of crop window
// Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them
let [matches_first_index, matches_last_index] =
@ -196,28 +205,17 @@ impl MatchBoundsHelper<'_> {
crop_size,
);
let is_index_backward_at_limit = index_backward == 0;
let is_index_forward_at_limit = index_forward == self.tokens.len() - 1;
asd(first_match.get_first_token_pos(), last_match.get_last_token_pos());
asd(index_backward, index_forward);
let backward_token = &self.tokens[index_backward];
let crop_byte_start = if is_index_backward_at_limit {
backward_token.byte_start
} else {
backward_token.byte_end
};
let forward_token = &self.tokens[index_forward];
let crop_byte_end = if is_index_forward_at_limit {
forward_token.byte_end
} else {
forward_token.byte_start
};
MatchesAndCropIndices {
matches_first_index,
matches_last_index,
crop_byte_start,
crop_byte_end,
crop_byte_start: backward_token.byte_start,
crop_byte_end: forward_token.byte_end,
}
}
@ -248,7 +246,7 @@ impl MatchBounds {
if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
if matches.is_empty() {
return mbh.get_crop_bounds_with_no_matches(crop_size);
return Some(mbh.get_crop_bounds_with_no_matches(crop_size));
}
if format_options.highlight {
@ -258,15 +256,15 @@ impl MatchBounds {
return Some(mbh.get_crop_bounds_with_matches(crop_size));
}
if format_options.highlight && !matches.is_empty() {
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
} else {
None
if !format_options.highlight || matches.is_empty() {
return None;
}
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
}
}

View file

@ -115,17 +115,21 @@ impl MatchingWords {
let position = [*positions.start(), *positions.end()];
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
if !matching_phrases.is_empty() {
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
}
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
if !matching_words.is_empty() {
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
}
// Sort words by having `is_prefix` as false first and then by their lengths in reverse order.
@ -147,12 +151,11 @@ impl MatchingWords {
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
) -> Option<(Match, UserQueryPositionRange)> {
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
let words_iter = self
.phrase_interner
.get(lmp.value)
.words
let words = &self.phrase_interner.get(lmp.value).words;
let words_iter = words
.iter()
.map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str()))
.map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str()))
.peekable();
(lmp.position, words_iter)
@ -161,7 +164,7 @@ impl MatchingWords {
'outer: loop {
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;
// TODO: Is it worth only cloning if we have to?
// TODO: if it's worth it, clone only if we have to
let mut tph_iter = token_position_helper_iter.clone();
let mut first_tph_details = None;
@ -241,46 +244,50 @@ impl MatchingWords {
tph: TokenPositionHelper,
text: &str,
) -> Option<(Match, UserQueryPositionRange)> {
let mut iter =
self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w)));
// TODO: There is potentially an optimization to be made here
// if we matched a term then we can skip checking it for further iterations?
loop {
let (located_words, word) = iter.next()?;
let word = self.word_interner.get(*word);
self.located_matching_words
.iter()
.flat_map(|lw| lw.value.iter().map(move |w| (lw, w)))
.find_map(|(located_words, word)| {
let word = self.word_interner.get(*word);
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) {
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix)
{
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
// TODO: Investigate token original byte length and similar methods and why they're not good enough
// TODO: Investigate token original byte length and similar methods and why they're not good enough
// That might be because token original byte length only or could also refer to the normalized byte length
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => continue,
};
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => return None,
};
return Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
},
},
},
located_words.position,
));
}
located_words.position,
))
})
}
pub fn get_matches_and_query_positions(
@ -361,93 +368,93 @@ impl Debug for MatchingWords {
}
}
#[cfg(test)]
pub(crate) mod tests {
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::search::new::matches::tests::temp_index_with_documents;
use crate::search::new::query_term::ExtractedTokens;
use charabia::{TokenKind, TokenizerBuilder};
use std::borrow::Cow;
// #[cfg(test)]
// pub(crate) mod tests {
// use super::super::super::located_query_terms_from_tokens;
// use super::*;
// use crate::search::new::matches::tests::temp_index_with_documents;
// use crate::search::new::query_term::ExtractedTokens;
// use charabia::{TokenKind, TokenizerBuilder};
// use std::borrow::Cow;
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let text = "split this world";
let tokens = tokenizer.tokenize(text);
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, &query_terms);
// #[test]
// fn matching_words() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
// let mut builder = TokenizerBuilder::default();
// let tokenizer = builder.build();
// let text = "split this world";
// let tokens = tokenizer.tokenize(text);
// let ExtractedTokens { query_terms, .. } =
// located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
// let matching_words = MatchingWords::new(ctx, &query_terms);
assert_eq!(
matching_words.get_matches_and_query_positions(
&[
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
}
],
text
),
(
vec![
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 0, token_position: 0 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 2, token_position: 2 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 3, token_position: 3 }
}
],
vec![
QueryPosition { range: [0, 0], index: 0 },
QueryPosition { range: [2, 2], index: 1 },
QueryPosition { range: [2, 2], index: 2 }
]
)
);
}
}
// assert_eq!(
// matching_words.get_matches_and_query_positions(
// &[
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("split"),
// char_end: "split".chars().count(),
// byte_end: "split".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("nyc"),
// char_end: "nyc".chars().count(),
// byte_end: "nyc".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("world"),
// char_end: "world".chars().count(),
// byte_end: "world".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("worlded"),
// char_end: "worlded".chars().count(),
// byte_end: "worlded".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("thisnew"),
// char_end: "thisnew".chars().count(),
// byte_end: "thisnew".len(),
// ..Default::default()
// }
// ],
// text
// ),
// (
// vec![
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 0, token_position: 0 }
// },
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 2, token_position: 2 }
// },
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 3, token_position: 3 }
// }
// ],
// vec![
// QueryPosition { range: [0, 0], index: 0 },
// QueryPosition { range: [2, 2], index: 1 },
// QueryPosition { range: [2, 2], index: 2 }
// ]
// )
// );
// }
// }

View file

@ -48,10 +48,9 @@ impl<'a> MatcherBuilder<'a> {
}
}
#[derive(Copy, Clone, Default, Debug)]
#[derive(Copy, Clone, Default)]
pub struct FormatOptions {
pub highlight: bool,
// TODO: Should this be usize?
pub crop: Option<usize>,
}
@ -80,7 +79,9 @@ impl Matcher<'_, '_, '_, '_> {
/// TODO: description
pub fn get_match_bounds(
&mut self,
// TODO: Add option to count grapheme clusters instead of bytes
// TODO: Add option to count UTF-16 segments, or whatever JS works with when slicing strings
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#utf-16_characters_unicode_code_points_and_grapheme_clusters
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice
format_options: Option<FormatOptions>,
) -> Option<MatchBounds> {
if self.text.is_empty() {
@ -152,7 +153,6 @@ mod tests {
use crate::index::tests::TempIndex;
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
use charabia::TokenizerBuilder;
use memmap2::Mmap;
impl<'a> MatcherBuilder<'a> {
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
@ -180,10 +180,9 @@ mod tests {
.unwrap();
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
None => MatchingWords::default(),
};
let matching_words = located_query_terms
.map(|located_query_terms| MatchingWords::new(ctx, &located_query_terms))
.unwrap_or_default();
MatcherBuilder::new(
matching_words,
@ -197,283 +196,401 @@ mod tests {
}
}
pub fn temp_index_with_documents(documents: Option<Mmap>) -> TempIndex {
pub fn rename_me(
format_options: Option<FormatOptions>,
text: &str,
query: &str,
expected_text: &str,
) {
let temp_index = TempIndex::new();
// document will always contain the same exact text normally
// TODO: Describe this better and ask if this is actually the case
temp_index
.add_documents(documents.unwrap_or_else(|| {
documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
])
}))
.add_documents(documents!([
{ "id": 1, "text": text.to_string() },
]))
.unwrap();
temp_index
}
fn get_expected_maybe_text(expected_text: &str, text: &str) -> Option<String> {
if expected_text == text {
None
} else {
Some(expected_text.to_string())
}
}
#[test]
fn format_identity() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: false, crop: None });
let test_values = [
// Text without any match.
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// Text containing all matches.
"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// Text containing some matches.
"Natalie risk her future to build a world with the boy she loves."
];
for text in test_values {
let mut matcher = builder.build(text, None);
// no crop and no highlight should return complete text.
assert_eq!(matcher.get_formatted_text(format_options), None);
}
}
#[test]
fn format_highlight() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: true, crop: None });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn highlight_unicode() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let format_options = Some(FormatOptions { highlight: true, crop: None });
let test_values = [
// Text containing prefix match.
["world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"],
// Text containing unicode match.
["world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"],
// Text containing unicode match.
["westfali", "Westfália", "<em>Westfáli</em>a"],
];
for [query, text, expected_text] in test_values {
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_crop() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right…"],
// Text without any match starting by a separator.
["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
"(A quick brown fox can not jump 32 feet, right…" ],
// Test phrase propagation
["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
"…Split The World is a book written by Emily Henry…"],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"…future to build a world with the boy she loves."],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"…she loves. Emily Henry: The Love That Split The World."],
// Text containing a match unordered and a match ordered.
["The world split void void void void void void void void void split the world void void",
"…void void void void void split the world void void"],
// Text containing matches with different density.
["split void the void void world void void void void void void void void void void split the world void void",
"…void void void void void split the world void void"],
["split split split split split split void void void void void void void void void void split the world void void",
"…void void void void void split the world void void"]
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_highlight_crop() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right…"],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"…future to build a <em>world</em> with <em>the</em> boy she loves."],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// Text containing a match unordered and a match ordered.
["The world split void void void void void void void void void split the world void void",
"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"]
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_highlight_crop_phrase_query() {
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
let temp_index = temp_index_with_documents(Some(documents!([
{ "id": 1, "text": text }
])));
let rtxn = temp_index.read_txn().unwrap();
let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
let test_values = [
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
["\"the world\"",
"…the power to split <em>the world</em> between those who embraced…"],
// should highlight "those" and the phrase "and those".
["those \"and those\"",
"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"],
["\"The groundbreaking invention had the power to split the world\"",
"<em>The groundbreaking invention had the power to split the world</em>…"],
["\"The groundbreaking invention had the power to split the world between those\"",
"<em>The groundbreaking invention had the power to split the world</em>…"],
["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
"…between those who <em>embraced progress and those who resisted change</em>!"],
["\"groundbreaking invention\" \"split the world between\"",
"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"],
["\"groundbreaking invention\" \"had the power to split the world between those\"",
"…<em>invention</em> <em>had the power to split the world between those</em>…"],
];
for [query, expected_text] in test_values {
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn smaller_crop_size() {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let text = "void void split the world void void.";
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
let test_values = [
// set a smaller crop size
// because crop size < query size, partially format matches.
(2, "…split the…"),
// set a smaller crop size
// because crop size < query size, partially format matches.
(1, "…split…"),
// set crop size to 0
// because crop size is 0, crop is ignored.
(0, "void void split the world void void."),
];
for (crop_size, expected_text) in test_values {
// set a smaller crop size
let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
}
#[test]
fn partial_matches() {
    // Mixes plain words with phrase queries whose words are split across
    // token boundaries in the text ("t he", "do or").
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    let text = "the do or die can't be he do and or isn't he";
    let mut matcher = builder.build(text, None);
    assert_eq!(
        matcher.get_formatted_text(format_options),
        Some(
            "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
                .to_string()
        )
    );
}
/// "Dei store fiskane eta dei små — dei liger under som minst förmå."
///
/// (Men are like fish; the great ones devour the small.)
fn rename_me_with_base_text(
    format_options: Option<FormatOptions>,
    query: &str,
    expected_text: &str,
) {
    // Shared haystack used by the phrase-highlight crop tests below.
    const BASE_TEXT: &str = "Dei store fiskane eta dei små — dei liger under som minst förmå.";
    rename_me(format_options, BASE_TEXT, query, expected_text);
}
#[test]
fn phrase_highlight_bigger_than_crop() {
    // The crop window (1 word) is narrower than the two-word phrase, so only
    // the first word of the phrase survives the crop — still highlighted.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(1) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei</em>…");
}
#[test]
fn phrase_highlight_same_size_as_crop() {
    // The crop window exactly fits the two-word phrase: the whole phrase is
    // kept and highlighted, with ellipses on both sides.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(2) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei liger</em>…");
}
#[test]
fn phrase_highlight_crop_middle() {
    // The phrase sits in the middle of the text, so the 4-word crop window
    // adds context on both sides of the highlighted phrase.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…små — <em>dei liger</em> under…");
}
#[test]
fn phrase_highlight_crop_end() {
    // The phrase ends the text: the crop window extends backwards only, and
    // no trailing ellipsis is added.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"minst förmå\"", "…under som <em>minst förmå</em>.");
}
#[test]
fn phrase_highlight_crop_beginning() {
    // The phrase opens the text: the crop window extends forwards only, and
    // no leading ellipsis is added.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"Dei store\"", "<em>Dei store</em> fiskane eta…");
}
#[test]
fn highlight_end() {
    // Non-phrase query matching the final words; no crop, so the whole text
    // is returned with each matched word highlighted separately.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "minst förmå",
        "Dei store fiskane eta dei små — dei liger under som <em>minst</em> <em>förmå</em>.",
    );
}
#[test]
fn highlight_beginning_and_middle() {
    // "Dei"/"dei" occurrences are matched wherever they appear (case-insensitive),
    // not only at the beginning of the text.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "Dei store",
        "<em>Dei</em> <em>store</em> fiskane eta <em>dei</em> små — <em>dei</em> liger under som minst förmå.",
    );
}
#[test]
fn partial_match_middle() {
    // TODO: Is this intentional?
    // When the only interned word is "forma", the searched prefix word "fo"
    // is not found inside "forma" by
    // milli::search::new::matches::matching_words::MatchingWords::try_get_word_match;
    // milli::search::new::query_term::QueryTerm::all_computed_derivations
    // might be at fault here.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    // interned words = ["forma"]
    rename_me(highlight_only(), "altså, förmå, på en måte", "fo", "altså, <em>förmå</em>, på en måte");
    // interned words = ["fo", "forma"]
    rename_me(
        highlight_only(),
        "altså, fo förmå, på en måte",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
#[test]
fn partial_match_end() {
    // Same prefix-match scenario as partial_match_middle, with the candidate
    // word at the start of the text and the match running to the end.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    rename_me(highlight_only(), "förmå, på en måte", "fo", "<em>förmå</em>, på en måte");
    rename_me(
        highlight_only(),
        "fo förmå, på en måte",
        "fo",
        "<em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
#[test]
fn partial_match_beginning() {
    // Same prefix-match scenario as partial_match_middle, with the candidate
    // word at the very end of the text.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    rename_me(highlight_only(), "altså, förmå", "fo", "altså, <em>förmå</em>");
    rename_me(
        highlight_only(),
        "altså, fo förmå",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå",
    );
}
// #[test]
// fn format_identity() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: false, crop: None });
// let test_values = [
// // Text without any match.
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// // Text containing all matches.
// "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// // Text containing some matches.
// "Natalie risk her future to build a world with the boy she loves."
// ];
// for text in test_values {
// let mut matcher = builder.build(text, None);
// // no crop and no highlight should return complete text.
// assert_eq!(matcher.get_formatted_text(format_options), None);
// }
// }
// #[test]
// fn format_highlight() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let test_values = [
// // empty text.
// ["", ""],
// // text containing only separators.
// [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn highlight_unicode() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let test_values = [
// // Text containing prefix match.
// ["world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"],
// // Text containing unicode match.
// ["world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"],
// // Text containing unicode match.
// ["westfali", "Westfália", "<em>Westfáli</em>a"],
// ];
// for [query, text, expected_text] in test_values {
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_crop() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });
// let test_values = [
// // empty text.
// // ["", ""],
// // text containing only separators.
// // [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right…"],
// // Text without any match starting by a separator.
// ["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
// "(A quick brown fox can not jump 32 feet, right…" ],
// // Test phrase propagation
// ["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
// "…Split The World is a book written by Emily Henry…"],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "…future to build a world with the boy she loves."],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "…she loves. Emily Henry: The Love That Split The World."],
// // Text containing a match unordered and a match ordered.
// ["The world split void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"],
// // Text containing matches with different density.
// ["split void the void void world void void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"],
// ["split split split split split split void void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"]
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_highlight_crop() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
// let test_values = [
// // empty text.
// ["", ""],
// // text containing only separators.
// [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right…"],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "…future to build a <em>world</em> with <em>the</em> boy she loves."],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// // Text containing a match unordered and a match ordered.
// ["The world split void void void void void void void void void split the world void void",
// "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"]
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_highlight_crop_phrase_query() {
// //! testing: https://github.com/meilisearch/meilisearch/issues/3975
// let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
// let temp_index = temp_index_with_documents(Some(documents!([
// { "id": 1, "text": text }
// ])));
// let rtxn = temp_index.read_txn().unwrap();
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
// let test_values = [
// // should return 10 words with a marker at the start as well the end, and the highlighted matches.
// ["\"the world\"",
// "…the power to split <em>the world</em> between those who embraced…"],
// // should highlight "those" and the phrase "and those".
// ["those \"and those\"",
// "…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"],
// ["\"The groundbreaking invention had the power to split the world\"",
// "<em>The groundbreaking invention had the power to split the world</em>…"],
// ["\"The groundbreaking invention had the power to split the world between those\"",
// "<em>The groundbreaking invention had the power to split the world</em>…"],
// ["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
// "…between those who <em>embraced progress and those who resisted change</em>!"],
// ["\"groundbreaking invention\" \"split the world between\"",
// "…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"],
// ["\"groundbreaking invention\" \"had the power to split the world between those\"",
// "…<em>invention</em> <em>had the power to split the world between those</em>…"],
// ];
// for [query, expected_text] in test_values {
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
// let mut matcher = builder.build(text, None);
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn smaller_crop_size() {
// //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let text = "void void split the world void void.";
// let mut matcher = builder.build(text, None);
// let test_values = [
// // set a smaller crop size
// // because crop size < query size, partially format matches.
// (2, "…split the…"),
// // set a smaller crop size
// // because crop size < query size, partially format matches.
// (1, "…split…"),
// // set crop size to 0
// // because crop size is 0, crop is ignored.
// (0, "void void split the world void void."),
// ];
// for (crop_size, expected_text) in test_values {
// // set a smaller crop size
// let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn partial_matches() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let text = "the do or die can't be he do and or isn't he";
// let mut matcher = builder.build(text, None);
// assert_eq!(
// matcher.get_formatted_text(format_options),
// Some(
// "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
// .to_string()
// )
// );
// }
}

View file

@ -489,8 +489,7 @@ impl QueryTerm {
let mut words = BTreeSet::new();
let mut phrases = BTreeSet::new();
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
&self.zero_typo;
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo;
words.extend(zero_typo.iter().copied());
words.extend(prefix_of.iter().copied());
phrases.extend(phrase.iter().copied());