diff --git a/crates/milli/.tmp4e121b/data.mdb b/crates/milli/.tmp4e121b/data.mdb new file mode 100644 index 000000000..f6705d4f1 Binary files /dev/null and b/crates/milli/.tmp4e121b/data.mdb differ diff --git a/crates/milli/.tmp4e121b/lock.mdb b/crates/milli/.tmp4e121b/lock.mdb new file mode 100644 index 000000000..b4ab05270 Binary files /dev/null and b/crates/milli/.tmp4e121b/lock.mdb differ diff --git a/crates/milli/.tmpNxMsye/data.mdb b/crates/milli/.tmpNxMsye/data.mdb new file mode 100644 index 000000000..ea920733d Binary files /dev/null and b/crates/milli/.tmpNxMsye/data.mdb differ diff --git a/crates/milli/.tmpNxMsye/lock.mdb b/crates/milli/.tmpNxMsye/lock.mdb new file mode 100644 index 000000000..abe89541a Binary files /dev/null and b/crates/milli/.tmpNxMsye/lock.mdb differ diff --git a/crates/milli/src/search/new/matches/adjust_indices.rs b/crates/milli/src/search/new/matches/adjust_indices.rs index 737c40ee3..b7d9ad793 100644 --- a/crates/milli/src/search/new/matches/adjust_indices.rs +++ b/crates/milli/src/search/new/matches/adjust_indices.rs @@ -1,152 +1,153 @@ use std::cmp::Ordering; -use charabia::{SeparatorKind, Token, TokenKind}; +use charabia::{SeparatorKind, Token}; -enum SimpleTokenKind { - Separator(SeparatorKind), - NonSeparator, - Done, +#[derive(Clone)] +enum Direction { + Forwards, + Backwards, } -impl SimpleTokenKind { - fn new(token: &Token) -> Self { - match token.kind { - TokenKind::Separator(separator_kind) => Self::Separator(separator_kind), - _ => Self::NonSeparator, - } - } -} - -struct CropBoundsHelper<'a> { - tokens: &'a [Token<'a>], - index_backward: usize, - backward_token_kind: SimpleTokenKind, - index_forward: usize, - forward_token_kind: SimpleTokenKind, -} - -impl CropBoundsHelper<'_> { - fn advance_backward(&mut self) { - if matches!(self.backward_token_kind, SimpleTokenKind::Done) { - return; - } - - if self.index_backward != 0 { - self.index_backward -= 1; - self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]); - } else { - self.backward_token_kind = SimpleTokenKind::Done; - } - } - - fn advance_forward(&mut self) { - if matches!(self.forward_token_kind, SimpleTokenKind::Done) { - return; - } - - if self.index_forward != self.tokens.len() - 1 { - self.index_forward += 1; - self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]); - } else { - self.forward_token_kind = SimpleTokenKind::Done; +impl Direction { + fn switch(&mut self) { + *self = match self { + Direction::Backwards => Direction::Forwards, + Direction::Forwards => Direction::Backwards, } } } fn get_adjusted_indices_for_too_few_words( tokens: &[Token], - index_backward: usize, - index_forward: usize, + mut index_backward: usize, + mut index_forward: usize, mut words_count: usize, crop_size: usize, ) -> [usize; 2] { - let crop_size = crop_size + 2; - let mut cbh = CropBoundsHelper { - tokens, - index_backward, - backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]), - index_forward, - forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]), - }; + let mut valid_index_backward = index_backward; + let mut valid_index_forward = index_forward; + + let mut is_end_reached = index_forward == tokens.len() - 1; + let mut is_beginning_reached = index_backward == 0; + + let mut is_index_backwards_at_hard_separator = false; + let mut is_index_forwards_at_hard_separator = false; + + // false + ends reached because TODO + let mut is_crop_size_or_both_ends_reached = is_end_reached && is_beginning_reached; + + let mut dir = Direction::Forwards; loop { - match [&cbh.backward_token_kind, &cbh.forward_token_kind] { - // if they are both separators and are the same kind then advance both, - // or expand in the soft separator side - [SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => { - if backward_sk == forward_sk { - cbh.advance_backward(); + if is_crop_size_or_both_ends_reached { + break; + } - // this avoids having an ending separator before crop marker - if words_count < crop_size - 1 { - cbh.advance_forward(); + let (index, valid_index) = match dir { + Direction::Backwards => (&mut index_backward, &mut valid_index_backward), + Direction::Forwards => (&mut index_forward, &mut valid_index_forward), + }; + + loop { + match dir { + Direction::Forwards => { + if is_end_reached { + break; } - } else if matches!(backward_sk, SeparatorKind::Hard) { - cbh.advance_forward(); - } else { - cbh.advance_backward(); + + *index += 1; + + is_end_reached = *index == tokens.len() - 1; } + Direction::Backwards => { + if is_beginning_reached + || (!is_end_reached + && is_index_backwards_at_hard_separator + && !is_index_forwards_at_hard_separator) + { + break; + } + + *index -= 1; + + is_beginning_reached = *index == 0; + } + }; + + if is_end_reached && is_beginning_reached { + is_crop_size_or_both_ends_reached = true; } - // both are words, advance left then right if we haven't reached `crop_size` - [SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => { - cbh.advance_backward(); + + let maybe_is_token_hard_separator = tokens[*index] + .separator_kind() + .map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard)); + + // it's not a separator + if maybe_is_token_hard_separator.is_none() { + *valid_index = *index; words_count += 1; - if words_count != crop_size { - cbh.advance_forward(); - words_count += 1; + if words_count == crop_size { + is_crop_size_or_both_ends_reached = true; } + + break; } - [SimpleTokenKind::Done, SimpleTokenKind::Done] => break, - // if one of the tokens is non-separator and the other a separator, we expand in the non-separator side - // if one of the sides reached the end, we expand in the opposite direction - [backward_stk, SimpleTokenKind::Done] - | [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => { - if matches!(backward_stk, SimpleTokenKind::NonSeparator) { - words_count += 1; - } - cbh.advance_backward(); - } - [SimpleTokenKind::Done, forward_stk] - | [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => { - if matches!(forward_stk, SimpleTokenKind::NonSeparator) { - words_count += 1; - } - cbh.advance_forward(); - } + + let is_index_at_hard_separator = match dir { + Direction::Backwards => &mut is_index_backwards_at_hard_separator, + Direction::Forwards => &mut is_index_forwards_at_hard_separator, + }; + *is_index_at_hard_separator = + maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard); } + dir.switch(); + + // 1. if end is reached, we can only advance backwards + // 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards + } + + // keep advancing forward to check if there's only separator tokens left until the end + // if so, then include those too in the index range + let mut try_index_forward = valid_index_forward + 1; + while let Some(token) = tokens.get(try_index_forward) { + if !token.is_separator() { + return [valid_index_backward, valid_index_forward]; + } + + try_index_forward += 1; + } + + [valid_index_backward, try_index_forward - 1] +} + +fn get_adjusted_index_forward_for_too_many_words( + tokens: &[Token], + index_backward: usize, + mut index_forward: usize, + mut words_count: usize, + crop_size: usize, +) -> usize { + loop { + if index_forward == index_backward { + return index_forward; + } + + index_forward -= 1; + + if tokens[index_forward].is_separator() { + continue; + } + + words_count -= 1; + if words_count == crop_size { break; } } - [cbh.index_backward, cbh.index_forward] -} - -fn get_adjusted_index_forward_for_too_many_words( - tokens: &[Token], - mut index_forward: usize, - mut words_count: usize, - crop_size: usize, -) -> usize { - while index_forward != 0 { - if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) { - words_count -= 1; - - if words_count == crop_size { - break; - } - } - - index_forward -= 1; - } - - if index_forward == 0 { - return index_forward; - } - - index_forward - 1 + index_forward } pub fn get_adjusted_indices_for_highlights_and_crop_size( @@ -164,14 +165,12 @@ pub fn get_adjusted_indices_for_highlights_and_crop_size( words_count, crop_size, ), - Ordering::Equal => [ - if index_backward != 0 { index_backward - 1 } else { index_backward }, - if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward }, - ], + Ordering::Equal => [index_backward, index_forward], Ordering::Greater => [ index_backward, get_adjusted_index_forward_for_too_many_words( tokens, + index_backward, index_forward, words_count, crop_size, @@ -185,7 +184,7 @@ pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usi let mut index = 0; while index != tokens.len() - 1 { - if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) { + if !tokens[index].is_separator() { words_count += 1; if words_count == crop_size { diff --git a/crates/milli/src/search/new/matches/match_bounds.rs b/crates/milli/src/search/new/matches/match_bounds.rs index 5e6c0d7d7..44f88b648 100644 --- a/crates/milli/src/search/new/matches/match_bounds.rs +++ b/crates/milli/src/search/new/matches/match_bounds.rs @@ -14,6 +14,7 @@ use utoipa::ToSchema; use super::FormatOptions; +// TODO: Differentiate if full match do not return None, instead return match bounds with full length #[derive(Serialize, ToSchema)] #[serde(rename_all = "camelCase")] pub struct MatchBounds { @@ -158,20 +159,28 @@ impl MatchBoundsHelper<'_> { } /// For crop but no highlight. - fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option { + fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds { let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size); let final_token = &self.tokens[final_token_index]; - if final_token_index == self.tokens.len() - 1 { - return None; - } - // TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start? - Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] }) + // TODO: Can here be an error, because it's byte_start but it could be byte_end? + MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] } } fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices { + let asd = |i1, i2| { + println!( + "{}|{}|{}\n{} {}", + self.tokens[..i1].iter().map(|v| v.lemma()).collect::>().join(""), + self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::>().join(""), + self.tokens[i2..].iter().map(|v| v.lemma()).collect::>().join(""), + i1, + i2 + ); + }; + // TODO: This doesn't give back 2 phrases if one is out of crop window // Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them let [matches_first_index, matches_last_index] = @@ -196,28 +205,17 @@ impl MatchBoundsHelper<'_> { crop_size, ); - let is_index_backward_at_limit = index_backward == 0; - let is_index_forward_at_limit = index_forward == self.tokens.len() - 1; + asd(first_match.get_first_token_pos(), last_match.get_last_token_pos()); + asd(index_backward, index_forward); let backward_token = &self.tokens[index_backward]; - let crop_byte_start = if is_index_backward_at_limit { - backward_token.byte_start - } else { - backward_token.byte_end - }; - let forward_token = &self.tokens[index_forward]; - let crop_byte_end = if is_index_forward_at_limit { - forward_token.byte_end - } else { - forward_token.byte_start - }; MatchesAndCropIndices { matches_first_index, matches_last_index, - crop_byte_start, - crop_byte_end, + crop_byte_start: backward_token.byte_start, + crop_byte_end: forward_token.byte_end, } } @@ -248,7 +246,7 @@ impl MatchBounds { if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) { if matches.is_empty() { - return mbh.get_crop_bounds_with_no_matches(crop_size); + return Some(mbh.get_crop_bounds_with_no_matches(crop_size)); } if format_options.highlight { @@ -258,15 +256,15 @@ impl MatchBounds { return Some(mbh.get_crop_bounds_with_matches(crop_size)); } - if format_options.highlight && !matches.is_empty() { - Some(mbh.get_match_bounds(MatchesAndCropIndices { - matches_first_index: 0, - matches_last_index: matches.len() - 1, - crop_byte_start: 0, - crop_byte_end: tokens[tokens.len() - 1].byte_end, - })) - } else { - None + if !format_options.highlight || matches.is_empty() { + return None; } + + Some(mbh.get_match_bounds(MatchesAndCropIndices { + matches_first_index: 0, + matches_last_index: matches.len() - 1, + crop_byte_start: 0, + crop_byte_end: tokens[tokens.len() - 1].byte_end, + })) } } diff --git a/crates/milli/src/search/new/matches/matching_words.rs b/crates/milli/src/search/new/matches/matching_words.rs index fe0aac40e..ab7f90f05 100644 --- a/crates/milli/src/search/new/matches/matching_words.rs +++ b/crates/milli/src/search/new/matches/matching_words.rs @@ -115,17 +115,21 @@ impl MatchingWords { let position = [*positions.start(), *positions.end()]; - located_matching_phrases.reserve(matching_phrases.len()); - located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| { - LocatedMatchingPhrase { value: *matching_phrase, position } - })); + if !matching_phrases.is_empty() { + located_matching_phrases.reserve(matching_phrases.len()); + located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| { + LocatedMatchingPhrase { value: *matching_phrase, position } + })); + } - located_matching_words.push(LocatedMatchingWords { - value: matching_words, - position, - is_prefix: term.is_prefix(), - original_char_count: term.original_word(&ctx).chars().count(), - }); + if !matching_words.is_empty() { + located_matching_words.push(LocatedMatchingWords { + value: matching_words, + position, + is_prefix: term.is_prefix(), + original_char_count: term.original_word(&ctx).chars().count(), + }); + } } // Sort words by having `is_prefix` as false first and then by their lengths in reverse order. @@ -147,12 +151,11 @@ impl MatchingWords { token_position_helper_iter: &mut (impl Iterator> + Clone), ) -> Option<(Match, UserQueryPositionRange)> { let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| { - let words_iter = self - .phrase_interner - .get(lmp.value) - .words + let words = &self.phrase_interner.get(lmp.value).words; + + let words_iter = words .iter() - .map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str())) + .map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str())) .peekable(); (lmp.position, words_iter) @@ -161,7 +164,7 @@ impl MatchingWords { 'outer: loop { let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?; - // TODO: Is it worth only cloning if we have to? + // TODO: if it's worth it, clone only if we have to let mut tph_iter = token_position_helper_iter.clone(); let mut first_tph_details = None; @@ -241,46 +244,50 @@ impl MatchingWords { tph: TokenPositionHelper, text: &str, ) -> Option<(Match, UserQueryPositionRange)> { - let mut iter = - self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w))); + // TODO: There is potentially an optimization to be made here + // if we matched a term then we can skip checking it for further iterations? - loop { - let (located_words, word) = iter.next()?; - let word = self.word_interner.get(*word); + self.located_matching_words + .iter() + .flat_map(|lw| lw.value.iter().map(move |w| (lw, w))) + .find_map(|(located_words, word)| { + let word = self.word_interner.get(*word); - let [char_count, byte_len] = - match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) { - PrefixedOrEquality::Prefixed => { - let prefix_byte_len = text[tph.token.byte_start..] - .char_indices() - .nth(located_words.original_char_count - 1) - .map(|(i, c)| i + c.len_utf8()) - .expect("expected text to have n-th thing bal bla TODO"); + let [char_count, byte_len] = + match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) + { + PrefixedOrEquality::Prefixed => { + let prefix_byte_len = text[tph.token.byte_start..] + .char_indices() + .nth(located_words.original_char_count - 1) + .map(|(i, c)| i + c.len_utf8()) + .expect("expected text to have n-th thing bal bla TODO"); - // TODO: Investigate token original byte length and similar methods and why they're not good enough + // TODO: Investigate token original byte length and similar methods and why they're not good enough + // That might be because token original byte length only or could also refer to the normalized byte length - [located_words.original_char_count, prefix_byte_len] - } - // do not +1, because Token index ranges are exclusive - PrefixedOrEquality::Equality => [ - tph.token.char_end - tph.token.char_start, - tph.token.byte_end - tph.token.byte_start, - ], - _ => continue, - }; + [located_words.original_char_count, prefix_byte_len] + } + // do not +1, because Token index ranges are exclusive + PrefixedOrEquality::Equality => [ + tph.token.char_end - tph.token.char_start, + tph.token.byte_end - tph.token.byte_start, + ], + _ => return None, + }; - return Some(( - Match { - char_count, - byte_len, - position: MatchPosition::Word { - word_position: tph.position_by_word, - token_position: tph.position_by_token, + Some(( + Match { + char_count, + byte_len, + position: MatchPosition::Word { + word_position: tph.position_by_word, + token_position: tph.position_by_token, + }, }, - }, - located_words.position, - )); - } + located_words.position, + )) + }) } pub fn get_matches_and_query_positions( @@ -361,93 +368,93 @@ impl Debug for MatchingWords { } } -#[cfg(test)] -pub(crate) mod tests { - use super::super::super::located_query_terms_from_tokens; - use super::*; - use crate::search::new::matches::tests::temp_index_with_documents; - use crate::search::new::query_term::ExtractedTokens; - use charabia::{TokenKind, TokenizerBuilder}; - use std::borrow::Cow; +// #[cfg(test)] +// pub(crate) mod tests { +// use super::super::super::located_query_terms_from_tokens; +// use super::*; +// use crate::search::new::matches::tests::temp_index_with_documents; +// use crate::search::new::query_term::ExtractedTokens; +// use charabia::{TokenKind, TokenizerBuilder}; +// use std::borrow::Cow; - #[test] - fn matching_words() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); - let mut builder = TokenizerBuilder::default(); - let tokenizer = builder.build(); - let text = "split this world"; - let tokens = tokenizer.tokenize(text); - let ExtractedTokens { query_terms, .. } = - located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); - let matching_words = MatchingWords::new(ctx, &query_terms); +// #[test] +// fn matching_words() { +// let temp_index = temp_index_with_documents(None); +// let rtxn = temp_index.read_txn().unwrap(); +// let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap(); +// let mut builder = TokenizerBuilder::default(); +// let tokenizer = builder.build(); +// let text = "split this world"; +// let tokens = tokenizer.tokenize(text); +// let ExtractedTokens { query_terms, .. } = +// located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); +// let matching_words = MatchingWords::new(ctx, &query_terms); - assert_eq!( - matching_words.get_matches_and_query_positions( - &[ - Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("split"), - char_end: "split".chars().count(), - byte_end: "split".len(), - ..Default::default() - }, - Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("nyc"), - char_end: "nyc".chars().count(), - byte_end: "nyc".len(), - ..Default::default() - }, - Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("world"), - char_end: "world".chars().count(), - byte_end: "world".len(), - ..Default::default() - }, - Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("worlded"), - char_end: "worlded".chars().count(), - byte_end: "worlded".len(), - ..Default::default() - }, - Token { - kind: TokenKind::Word, - lemma: Cow::Borrowed("thisnew"), - char_end: "thisnew".chars().count(), - byte_end: "thisnew".len(), - ..Default::default() - } - ], - text - ), - ( - vec![ - Match { - char_count: 5, - byte_len: 5, - position: MatchPosition::Word { word_position: 0, token_position: 0 } - }, - Match { - char_count: 5, - byte_len: 5, - position: MatchPosition::Word { word_position: 2, token_position: 2 } - }, - Match { - char_count: 5, - byte_len: 5, - position: MatchPosition::Word { word_position: 3, token_position: 3 } - } - ], - vec![ - QueryPosition { range: [0, 0], index: 0 }, - QueryPosition { range: [2, 2], index: 1 }, - QueryPosition { range: [2, 2], index: 2 } - ] - ) - ); - } -} +// assert_eq!( +// matching_words.get_matches_and_query_positions( +// &[ +// Token { +// kind: TokenKind::Word, +// lemma: Cow::Borrowed("split"), +// char_end: "split".chars().count(), +// byte_end: "split".len(), +// ..Default::default() +// }, +// Token { +// kind: TokenKind::Word, +// lemma: Cow::Borrowed("nyc"), +// char_end: "nyc".chars().count(), +// byte_end: "nyc".len(), +// ..Default::default() +// }, +// Token { +// kind: TokenKind::Word, +// lemma: Cow::Borrowed("world"), +// char_end: "world".chars().count(), +// byte_end: "world".len(), +// ..Default::default() +// }, +// Token { +// kind: TokenKind::Word, +// lemma: Cow::Borrowed("worlded"), +// char_end: "worlded".chars().count(), +// byte_end: "worlded".len(), +// ..Default::default() +// }, +// Token { +// kind: TokenKind::Word, +// lemma: Cow::Borrowed("thisnew"), +// char_end: "thisnew".chars().count(), +// byte_end: "thisnew".len(), +// ..Default::default() +// } +// ], +// text +// ), +// ( +// vec![ +// Match { +// char_count: 5, +// byte_len: 5, +// position: MatchPosition::Word { word_position: 0, token_position: 0 } +// }, +// Match { +// char_count: 5, +// byte_len: 5, +// position: MatchPosition::Word { word_position: 2, token_position: 2 } +// }, +// Match { +// char_count: 5, +// byte_len: 5, +// position: MatchPosition::Word { word_position: 3, token_position: 3 } +// } +// ], +// vec![ +// QueryPosition { range: [0, 0], index: 0 }, +// QueryPosition { range: [2, 2], index: 1 }, +// QueryPosition { range: [2, 2], index: 2 } +// ] +// ) +// ); +// } +// } diff --git a/crates/milli/src/search/new/matches/mod.rs b/crates/milli/src/search/new/matches/mod.rs index 0eab29d93..bab82da8c 100644 --- a/crates/milli/src/search/new/matches/mod.rs +++ b/crates/milli/src/search/new/matches/mod.rs @@ -48,10 +48,9 @@ impl<'a> MatcherBuilder<'a> { } } -#[derive(Copy, Clone, Default, Debug)] +#[derive(Copy, Clone, Default)] pub struct FormatOptions { pub highlight: bool, - // TODO: Should this be usize? pub crop: Option, } @@ -80,7 +79,9 @@ impl Matcher<'_, '_, '_, '_> { /// TODO: description pub fn get_match_bounds( &mut self, - // TODO: Add option to count grapheme clusters instead of bytes + // TODO: Add option to count UTF-16 segments, or whatever JS works with when slicing strings + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#utf-16_characters_unicode_code_points_and_grapheme_clusters + // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice format_options: Option, ) -> Option { if self.text.is_empty() { @@ -152,7 +153,6 @@ mod tests { use crate::index::tests::TempIndex; use crate::{execute_search, filtered_universe, SearchContext, TimeBudget}; use charabia::TokenizerBuilder; - use memmap2::Mmap; impl<'a> MatcherBuilder<'a> { fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self { @@ -180,10 +180,9 @@ mod tests { .unwrap(); // consume context and located_query_terms to build MatchingWords. - let matching_words = match located_query_terms { - Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms), - None => MatchingWords::default(), - }; + let matching_words = located_query_terms + .map(|located_query_terms| MatchingWords::new(ctx, &located_query_terms)) + .unwrap_or_default(); MatcherBuilder::new( matching_words, @@ -197,283 +196,401 @@ mod tests { } } - pub fn temp_index_with_documents(documents: Option) -> TempIndex { + pub fn rename_me( + format_options: Option, + text: &str, + query: &str, + expected_text: &str, + ) { let temp_index = TempIndex::new(); + // document will always contain the same exact text normally + // TODO: Describe this better and ask if this is actually the case temp_index - .add_documents(documents.unwrap_or_else(|| { - documents!([ - { "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" }, - { "id": 2, "name": "Westfália" }, - { "id": 3, "name": "Ŵôřlḑôle" }, - ]) - })) + .add_documents(documents!([ + { "id": 1, "text": text.to_string() }, + ])) .unwrap(); - temp_index - } - - fn get_expected_maybe_text(expected_text: &str, text: &str) -> Option { - if expected_text == text { - None - } else { - Some(expected_text.to_string()) - } - } - - #[test] - fn format_identity() { - let temp_index = temp_index_with_documents(None); let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - let format_options = Some(FormatOptions { highlight: false, crop: None }); - - let test_values = [ - // Text without any match. - "A quick brown fox can not jump 32 feet, right? Brr, it is cold!", - // Text containing all matches. - "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", - // Text containing some matches. - "Natalie risk her future to build a world with the boy she loves." - ]; - - for text in test_values { - let mut matcher = builder.build(text, None); - // no crop and no highlight should return complete text. - assert_eq!(matcher.get_formatted_text(format_options), None); - } - } - - #[test] - fn format_highlight() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - let format_options = Some(FormatOptions { highlight: true, crop: None }); - - let test_values = [ - // empty text. - ["", ""], - // text containing only separators. - [":-)", ":-)"], - // Text without any match. - ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", - "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"], - // Text containing all matches. - ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", - "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."], - // Text containing some matches. - ["Natalie risk her future to build a world with the boy she loves.", - "Natalie risk her future to build a world with the boy she loves."], - ]; - - for [text, expected_text] in test_values { - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } - } - - #[test] - fn highlight_unicode() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let format_options = Some(FormatOptions { highlight: true, crop: None }); - - let test_values = [ - // Text containing prefix match. - ["world", "Ŵôřlḑôle", "Ŵôřlḑôle"], - // Text containing unicode match. - ["world", "Ŵôřlḑ", "Ŵôřlḑ"], - // Text containing unicode match. - ["westfali", "Westfália", "Westfália"], - ]; - - for [query, text, expected_text] in test_values { - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } - } - - #[test] - fn format_crop() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - let format_options = Some(FormatOptions { highlight: false, crop: Some(10) }); - - let test_values = [ - // empty text. - ["", ""], - // text containing only separators. - [":-)", ":-)"], - // Text without any match. - ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", - "A quick brown fox can not jump 32 feet, right…"], - // Text without any match starting by a separator. - ["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)", - "(A quick brown fox can not jump 32 feet, right…" ], - // Test phrase propagation - ["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.", - "…Split The World is a book written by Emily Henry…"], - // Text containing some matches. - ["Natalie risk her future to build a world with the boy she loves.", - "…future to build a world with the boy she loves."], - // Text containing all matches. - ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", - "…she loves. Emily Henry: The Love That Split The World."], - // Text containing a match unordered and a match ordered. - ["The world split void void void void void void void void void split the world void void", - "…void void void void void split the world void void"], - // Text containing matches with different density. - ["split void the void void world void void void void void void void void void void split the world void void", - "…void void void void void split the world void void"], - ["split split split split split split void void void void void void void void void void split the world void void", - "…void void void void void split the world void void"] - ]; - - for [text, expected_text] in test_values { - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } - } - - #[test] - fn format_highlight_crop() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - let format_options = Some(FormatOptions { highlight: true, crop: Some(10) }); - - let test_values = [ - // empty text. - ["", ""], - // text containing only separators. - [":-)", ":-)"], - // Text without any match. - ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", - "A quick brown fox can not jump 32 feet, right…"], - // Text containing some matches. - ["Natalie risk her future to build a world with the boy she loves.", - "…future to build a world with the boy she loves."], - // Text containing all matches. - ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", - "…she loves. Emily Henry: The Love That Split The World."], - // Text containing a match unordered and a match ordered. - ["The world split void void void void void void void void void split the world void void", - "…void void void void void split the world void void"] - ]; - - for [text, expected_text] in test_values { - let mut matcher = builder.build(text, None); - // no crop should return complete text with highlighted matches. - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } - } - - #[test] - fn format_highlight_crop_phrase_query() { - //! testing: https://github.com/meilisearch/meilisearch/issues/3975 - let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; - let temp_index = temp_index_with_documents(Some(documents!([ - { "id": 1, "text": text } - ]))); - let rtxn = temp_index.read_txn().unwrap(); - - let format_options = Some(FormatOptions { highlight: true, crop: Some(10) }); - - let test_values = [ - // should return 10 words with a marker at the start as well the end, and the highlighted matches. - ["\"the world\"", - "…the power to split the world between those who embraced…"], - // should highlight "those" and the phrase "and those". - ["those \"and those\"", - "…world between those who embraced progress and those who resisted…"], - ["\"The groundbreaking invention had the power to split the world\"", - "The groundbreaking invention had the power to split the world…"], - ["\"The groundbreaking invention had the power to split the world between those\"", - "The groundbreaking invention had the power to split the world…"], - ["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", - "…between those who embraced progress and those who resisted change!"], - ["\"groundbreaking invention\" \"split the world between\"", - "…groundbreaking invention had the power to split the world between…"], - ["\"groundbreaking invention\" \"had the power to split the world between those\"", - "…invention had the power to split the world between those…"], - ]; - - for [query, expected_text] in test_values { - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); - let mut matcher = builder.build(text, None); - - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } - } - - #[test] - fn smaller_crop_size() { - //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); - let text = "void void split the world void void."; + let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); let mut matcher = builder.build(text, None); - let test_values = [ - // set a smaller crop size - // because crop size < query size, partially format matches. - (2, "…split the…"), - // set a smaller crop size - // because crop size < query size, partially format matches. - (1, "…split…"), - // set crop size to 0 - // because crop size is 0, crop is ignored. - (0, "void void split the world void void."), - ]; - - for (crop_size, expected_text) in test_values { - // set a smaller crop size - let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) }); - assert_eq!( - matcher.get_formatted_text(format_options), - get_expected_maybe_text(expected_text, text) - ); - } + assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); } - #[test] - fn partial_matches() { - let temp_index = temp_index_with_documents(None); - let rtxn = temp_index.read_txn().unwrap(); - let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\""); - - let format_options = Some(FormatOptions { highlight: true, crop: None }); - - let text = "the do or die can't be he do and or isn't he"; - let mut matcher = builder.build(text, None); - assert_eq!( - matcher.get_formatted_text(format_options), - Some( - "the do or die can't be he do and or isn't he" - .to_string() - ) + /// "Dei store fiskane eta dei små — dei liger under som minst förmå." + /// + /// (Men are like fish; the great ones devour the small.) + fn rename_me_with_base_text( + format_options: Option, + query: &str, + expected_text: &str, + ) { + rename_me( + format_options, + "Dei store fiskane eta dei små — dei liger under som minst förmå.", + query, + expected_text, ); } + + #[test] + fn phrase_highlight_bigger_than_crop() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(1) }), + "\"dei liger\"", + "…dei…", + ); + } + + #[test] + fn phrase_highlight_same_size_as_crop() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(2) }), + "\"dei liger\"", + "…dei liger…", + ); + } + + #[test] + fn phrase_highlight_crop_middle() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"dei liger\"", + "…små — dei liger under…", + ); + } + + #[test] + fn phrase_highlight_crop_end() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"minst förmå\"", + "…under som minst förmå.", + ); + } + + #[test] + fn phrase_highlight_crop_beginning() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: Some(4) }), + "\"Dei store\"", + "Dei store fiskane eta…", + ); + } + + #[test] + fn highlight_end() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: None }), + "minst förmå", + "Dei store fiskane eta dei små — dei liger under som minst förmå.", + ); + } + + #[test] + fn highlight_beginning_and_middle() { + rename_me_with_base_text( + Some(FormatOptions { highlight: true, crop: None }), + "Dei store", + "Dei store fiskane eta dei små — dei liger under som minst förmå.", + ); + } + + #[test] + fn partial_match_middle() { + // TODO: Is this intentional? + // Here the only interned word is "forma", hence it cannot find the searched prefix + // word "fo" inside "forma" within milli::search::new::matches::matching_words::MatchingWords::try_get_word_match + // `milli::search::new::query_term::QueryTerm::all_computed_derivations` might be at fault here + + // interned words = ["forma"] + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "altså, förmå, på en måte", + "fo", + "altså, förmå, på en måte", + ); + + // interned words = ["fo", "forma"] + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "altså, fo förmå, på en måte", + "fo", + "altså, fo rmå, på en måte", + ); + } + + #[test] + fn partial_match_end() { + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "förmå, på en måte", + "fo", + "förmå, på en måte", + ); + + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "fo förmå, på en måte", + "fo", + "fo rmå, på en måte", + ); + } + + #[test] + fn partial_match_beginning() { + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "altså, förmå", + "fo", + "altså, förmå", + ); + + rename_me( + Some(FormatOptions { highlight: true, crop: None }), + "altså, fo förmå", + "fo", + "altså, fo rmå", + ); + } + + // #[test] + // fn format_identity() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + // let format_options = Some(FormatOptions { highlight: false, crop: None }); + + // let test_values = [ + // // Text without any match. + // "A quick brown fox can not jump 32 feet, right? Brr, it is cold!", + // // Text containing all matches. + // "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", + // // Text containing some matches. + // "Natalie risk her future to build a world with the boy she loves." + // ]; + + // for text in test_values { + // let mut matcher = builder.build(text, None); + // // no crop and no highlight should return complete text. + // assert_eq!(matcher.get_formatted_text(format_options), None); + // } + // } + + // #[test] + // fn format_highlight() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + // let format_options = Some(FormatOptions { highlight: true, crop: None }); + + // let test_values = [ + // // empty text. + // ["", ""], + // // text containing only separators. + // [":-)", ":-)"], + // // Text without any match. + // ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", + // "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"], + // // Text containing all matches. + // ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", + // "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World."], + // // Text containing some matches. + // ["Natalie risk her future to build a world with the boy she loves.", + // "Natalie risk her future to build a world with the boy she loves."], + // ]; + + // for [text, expected_text] in test_values { + // let mut matcher = builder.build(text, None); + // // no crop should return complete text with highlighted matches. + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn highlight_unicode() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let format_options = Some(FormatOptions { highlight: true, crop: None }); + + // let test_values = [ + // // Text containing prefix match. + // ["world", "Ŵôřlḑôle", "Ŵôřlḑôle"], + // // Text containing unicode match. + // ["world", "Ŵôřlḑ", "Ŵôřlḑ"], + // // Text containing unicode match. + // ["westfali", "Westfália", "Westfália"], + // ]; + + // for [query, text, expected_text] in test_values { + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); + // let mut matcher = builder.build(text, None); + // // no crop should return complete text with highlighted matches. + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn format_crop() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + // let format_options = Some(FormatOptions { highlight: false, crop: Some(10) }); + + // let test_values = [ + // // empty text. + // // ["", ""], + // // text containing only separators. + // // [":-)", ":-)"], + // // Text without any match. + // ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", + // "A quick brown fox can not jump 32 feet, right…"], + // // Text without any match starting by a separator. + // ["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)", + // "(A quick brown fox can not jump 32 feet, right…" ], + // // Test phrase propagation + // ["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.", + // "…Split The World is a book written by Emily Henry…"], + // // Text containing some matches. + // ["Natalie risk her future to build a world with the boy she loves.", + // "…future to build a world with the boy she loves."], + // // Text containing all matches. + // ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", + // "…she loves. Emily Henry: The Love That Split The World."], + // // Text containing a match unordered and a match ordered. + // ["The world split void void void void void void void void void split the world void void", + // "…void void void void void split the world void void"], + // // Text containing matches with different density. + // ["split void the void void world void void void void void void void void void void split the world void void", + // "…void void void void void split the world void void"], + // ["split split split split split split void void void void void void void void void void split the world void void", + // "…void void void void void split the world void void"] + // ]; + + // for [text, expected_text] in test_values { + // let mut matcher = builder.build(text, None); + // // no crop should return complete text with highlighted matches. + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn format_highlight_crop() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + // let format_options = Some(FormatOptions { highlight: true, crop: Some(10) }); + + // let test_values = [ + // // empty text. + // ["", ""], + // // text containing only separators. + // [":-)", ":-)"], + // // Text without any match. + // ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!", + // "A quick brown fox can not jump 32 feet, right…"], + // // Text containing some matches. + // ["Natalie risk her future to build a world with the boy she loves.", + // "…future to build a world with the boy she loves."], + // // Text containing all matches. + // ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.", + // "…she loves. Emily Henry: The Love That Split The World."], + // // Text containing a match unordered and a match ordered. + // ["The world split void void void void void void void void void split the world void void", + // "…void void void void void split the world void void"] + // ]; + + // for [text, expected_text] in test_values { + // let mut matcher = builder.build(text, None); + // // no crop should return complete text with highlighted matches. + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn format_highlight_crop_phrase_query() { + // //! testing: https://github.com/meilisearch/meilisearch/issues/3975 + // let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!"; + // let temp_index = temp_index_with_documents(Some(documents!([ + // { "id": 1, "text": text } + // ]))); + // let rtxn = temp_index.read_txn().unwrap(); + + // let format_options = Some(FormatOptions { highlight: true, crop: Some(10) }); + + // let test_values = [ + // // should return 10 words with a marker at the start as well the end, and the highlighted matches. + // ["\"the world\"", + // "…the power to split the world between those who embraced…"], + // // should highlight "those" and the phrase "and those". + // ["those \"and those\"", + // "…world between those who embraced progress and those who resisted…"], + // ["\"The groundbreaking invention had the power to split the world\"", + // "The groundbreaking invention had the power to split the world…"], + // ["\"The groundbreaking invention had the power to split the world between those\"", + // "The groundbreaking invention had the power to split the world…"], + // ["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"", + // "…between those who embraced progress and those who resisted change!"], + // ["\"groundbreaking invention\" \"split the world between\"", + // "…groundbreaking invention had the power to split the world between…"], + // ["\"groundbreaking invention\" \"had the power to split the world between those\"", + // "…invention had the power to split the world between those…"], + // ]; + + // for [query, expected_text] in test_values { + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query); + // let mut matcher = builder.build(text, None); + + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn smaller_crop_size() { + // //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295 + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world"); + // let text = "void void split the world void void."; + // let mut matcher = builder.build(text, None); + + // let test_values = [ + // // set a smaller crop size + // // because crop size < query size, partially format matches. + // (2, "…split the…"), + // // set a smaller crop size + // // because crop size < query size, partially format matches. + // (1, "…split…"), + // // set crop size to 0 + // // because crop size is 0, crop is ignored. + // (0, "void void split the world void void."), + // ]; + + // for (crop_size, expected_text) in test_values { + // // set a smaller crop size + // let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) }); + // assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string())); + // } + // } + + // #[test] + // fn partial_matches() { + // let temp_index = temp_index_with_documents(None); + // let rtxn = temp_index.read_txn().unwrap(); + // let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\""); + + // let format_options = Some(FormatOptions { highlight: true, crop: None }); + + // let text = "the do or die can't be he do and or isn't he"; + // let mut matcher = builder.build(text, None); + // assert_eq!( + // matcher.get_formatted_text(format_options), + // Some( + // "the do or die can't be he do and or isn't he" + // .to_string() + // ) + // ); + // } } diff --git a/crates/milli/src/search/new/query_term/mod.rs b/crates/milli/src/search/new/query_term/mod.rs index ba8964e34..748248fc3 100644 --- a/crates/milli/src/search/new/query_term/mod.rs +++ b/crates/milli/src/search/new/query_term/mod.rs @@ -489,8 +489,7 @@ impl QueryTerm { let mut words = BTreeSet::new(); let mut phrases = BTreeSet::new(); - let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } = - &self.zero_typo; + let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo; words.extend(zero_typo.iter().copied()); words.extend(prefix_of.iter().copied()); phrases.extend(phrase.iter().copied());