Refactor, fix, misc

This commit is contained in:
F. Levi 2025-06-15 14:18:18 +03:00
parent 24f213c343
commit 561b4836d8
9 changed files with 689 additions and 569 deletions

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -1,152 +1,153 @@
use std::cmp::Ordering;
use charabia::{SeparatorKind, Token, TokenKind};
use charabia::{SeparatorKind, Token};
enum SimpleTokenKind {
Separator(SeparatorKind),
NonSeparator,
Done,
#[derive(Clone)]
enum Direction {
Forwards,
Backwards,
}
impl SimpleTokenKind {
fn new(token: &Token) -> Self {
match token.kind {
TokenKind::Separator(separator_kind) => Self::Separator(separator_kind),
_ => Self::NonSeparator,
}
}
}
struct CropBoundsHelper<'a> {
tokens: &'a [Token<'a>],
index_backward: usize,
backward_token_kind: SimpleTokenKind,
index_forward: usize,
forward_token_kind: SimpleTokenKind,
}
impl CropBoundsHelper<'_> {
fn advance_backward(&mut self) {
if matches!(self.backward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_backward != 0 {
self.index_backward -= 1;
self.backward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_backward]);
} else {
self.backward_token_kind = SimpleTokenKind::Done;
}
}
fn advance_forward(&mut self) {
if matches!(self.forward_token_kind, SimpleTokenKind::Done) {
return;
}
if self.index_forward != self.tokens.len() - 1 {
self.index_forward += 1;
self.forward_token_kind = SimpleTokenKind::new(&self.tokens[self.index_forward]);
} else {
self.forward_token_kind = SimpleTokenKind::Done;
impl Direction {
fn switch(&mut self) {
*self = match self {
Direction::Backwards => Direction::Forwards,
Direction::Forwards => Direction::Backwards,
}
}
}
fn get_adjusted_indices_for_too_few_words(
tokens: &[Token],
index_backward: usize,
index_forward: usize,
mut index_backward: usize,
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> [usize; 2] {
let crop_size = crop_size + 2;
let mut cbh = CropBoundsHelper {
tokens,
index_backward,
backward_token_kind: SimpleTokenKind::new(&tokens[index_backward]),
index_forward,
forward_token_kind: SimpleTokenKind::new(&tokens[index_forward]),
};
let mut valid_index_backward = index_backward;
let mut valid_index_forward = index_forward;
let mut is_end_reached = index_forward == tokens.len() - 1;
let mut is_beginning_reached = index_backward == 0;
let mut is_index_backwards_at_hard_separator = false;
let mut is_index_forwards_at_hard_separator = false;
// false + ends reached because TODO
let mut is_crop_size_or_both_ends_reached = is_end_reached && is_beginning_reached;
let mut dir = Direction::Forwards;
loop {
match [&cbh.backward_token_kind, &cbh.forward_token_kind] {
// if they are both separators and are the same kind then advance both,
// or expand in the soft separator side
[SimpleTokenKind::Separator(backward_sk), SimpleTokenKind::Separator(forward_sk)] => {
if backward_sk == forward_sk {
cbh.advance_backward();
if is_crop_size_or_both_ends_reached {
break;
}
// this avoids having an ending separator before crop marker
if words_count < crop_size - 1 {
cbh.advance_forward();
let (index, valid_index) = match dir {
Direction::Backwards => (&mut index_backward, &mut valid_index_backward),
Direction::Forwards => (&mut index_forward, &mut valid_index_forward),
};
loop {
match dir {
Direction::Forwards => {
if is_end_reached {
break;
}
} else if matches!(backward_sk, SeparatorKind::Hard) {
cbh.advance_forward();
} else {
cbh.advance_backward();
*index += 1;
is_end_reached = *index == tokens.len() - 1;
}
Direction::Backwards => {
if is_beginning_reached
|| (!is_end_reached
&& is_index_backwards_at_hard_separator
&& !is_index_forwards_at_hard_separator)
{
break;
}
*index -= 1;
is_beginning_reached = *index == 0;
}
};
if is_end_reached && is_beginning_reached {
is_crop_size_or_both_ends_reached = true;
}
// both are words, advance left then right if we haven't reached `crop_size`
[SimpleTokenKind::NonSeparator, SimpleTokenKind::NonSeparator] => {
cbh.advance_backward();
let maybe_is_token_hard_separator = tokens[*index]
.separator_kind()
.map(|sep_kind| matches!(sep_kind, SeparatorKind::Hard));
// it's not a separator
if maybe_is_token_hard_separator.is_none() {
*valid_index = *index;
words_count += 1;
if words_count != crop_size {
cbh.advance_forward();
words_count += 1;
if words_count == crop_size {
is_crop_size_or_both_ends_reached = true;
}
break;
}
[SimpleTokenKind::Done, SimpleTokenKind::Done] => break,
// if one of the tokens is non-separator and the other a separator, we expand in the non-separator side
// if one of the sides reached the end, we expand in the opposite direction
[backward_stk, SimpleTokenKind::Done]
| [backward_stk @ SimpleTokenKind::NonSeparator, SimpleTokenKind::Separator(_)] => {
if matches!(backward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_backward();
}
[SimpleTokenKind::Done, forward_stk]
| [SimpleTokenKind::Separator(_), forward_stk @ SimpleTokenKind::NonSeparator] => {
if matches!(forward_stk, SimpleTokenKind::NonSeparator) {
words_count += 1;
}
cbh.advance_forward();
}
let is_index_at_hard_separator = match dir {
Direction::Backwards => &mut is_index_backwards_at_hard_separator,
Direction::Forwards => &mut is_index_forwards_at_hard_separator,
};
*is_index_at_hard_separator =
maybe_is_token_hard_separator.is_some_and(|is_hard| is_hard);
}
dir.switch();
// 1. if end is reached, we can only advance backwards
// 2. if forwards index reached a hard separator and backwards is currently hard, we can go backwards
}
// keep advancing forward to check if there's only separator tokens left until the end
// if so, then include those too in the index range
let mut try_index_forward = valid_index_forward + 1;
while let Some(token) = tokens.get(try_index_forward) {
if !token.is_separator() {
return [valid_index_backward, valid_index_forward];
}
try_index_forward += 1;
}
[valid_index_backward, try_index_forward - 1]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
index_backward: usize,
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
loop {
if index_forward == index_backward {
return index_forward;
}
index_forward -= 1;
if tokens[index_forward].is_separator() {
continue;
}
words_count -= 1;
if words_count == crop_size {
break;
}
}
[cbh.index_backward, cbh.index_forward]
}
fn get_adjusted_index_forward_for_too_many_words(
tokens: &[Token],
mut index_forward: usize,
mut words_count: usize,
crop_size: usize,
) -> usize {
while index_forward != 0 {
if matches!(SimpleTokenKind::new(&tokens[index_forward]), SimpleTokenKind::NonSeparator) {
words_count -= 1;
if words_count == crop_size {
break;
}
}
index_forward -= 1;
}
if index_forward == 0 {
return index_forward;
}
index_forward - 1
index_forward
}
pub fn get_adjusted_indices_for_highlights_and_crop_size(
@ -164,14 +165,12 @@ pub fn get_adjusted_indices_for_highlights_and_crop_size(
words_count,
crop_size,
),
Ordering::Equal => [
if index_backward != 0 { index_backward - 1 } else { index_backward },
if index_forward != tokens.len() - 1 { index_forward + 1 } else { index_forward },
],
Ordering::Equal => [index_backward, index_forward],
Ordering::Greater => [
index_backward,
get_adjusted_index_forward_for_too_many_words(
tokens,
index_backward,
index_forward,
words_count,
crop_size,
@ -185,7 +184,7 @@ pub fn get_adjusted_index_forward_for_crop_size(tokens: &[Token], crop_size: usi
let mut index = 0;
while index != tokens.len() - 1 {
if matches!(SimpleTokenKind::new(&tokens[index]), SimpleTokenKind::NonSeparator) {
if !tokens[index].is_separator() {
words_count += 1;
if words_count == crop_size {

View file

@ -14,6 +14,7 @@ use utoipa::ToSchema;
use super::FormatOptions;
// TODO: Differentiate if full match do not return None, instead return match bounds with full length
#[derive(Serialize, ToSchema)]
#[serde(rename_all = "camelCase")]
pub struct MatchBounds {
@ -158,20 +159,28 @@ impl MatchBoundsHelper<'_> {
}
/// For crop but no highlight.
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> Option<MatchBounds> {
fn get_crop_bounds_with_no_matches(&self, crop_size: usize) -> MatchBounds {
let final_token_index = get_adjusted_index_forward_for_crop_size(self.tokens, crop_size);
let final_token = &self.tokens[final_token_index];
if final_token_index == self.tokens.len() - 1 {
return None;
}
// TODO: Why is it that when we match all of the tokens we need to get byte_end instead of start?
Some(MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] })
// TODO: Can here be an error, because it's byte_start but it could be byte_end?
MatchBounds { highlight_toggle: false, indices: vec![0, final_token.byte_start] }
}
fn get_matches_and_crop_indices(&self, crop_size: usize) -> MatchesAndCropIndices {
let asd = |i1, i2| {
println!(
"{}|{}|{}\n{} {}",
self.tokens[..i1].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i1..i2].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
self.tokens[i2..].iter().map(|v| v.lemma()).collect::<Vec<_>>().join(""),
i1,
i2
);
};
// TODO: This doesn't give back 2 phrases if one is out of crop window
// Solution: also get next and previous matches, and if they're in the crop window, even if partially, highlight them
let [matches_first_index, matches_last_index] =
@ -196,28 +205,17 @@ impl MatchBoundsHelper<'_> {
crop_size,
);
let is_index_backward_at_limit = index_backward == 0;
let is_index_forward_at_limit = index_forward == self.tokens.len() - 1;
asd(first_match.get_first_token_pos(), last_match.get_last_token_pos());
asd(index_backward, index_forward);
let backward_token = &self.tokens[index_backward];
let crop_byte_start = if is_index_backward_at_limit {
backward_token.byte_start
} else {
backward_token.byte_end
};
let forward_token = &self.tokens[index_forward];
let crop_byte_end = if is_index_forward_at_limit {
forward_token.byte_end
} else {
forward_token.byte_start
};
MatchesAndCropIndices {
matches_first_index,
matches_last_index,
crop_byte_start,
crop_byte_end,
crop_byte_start: backward_token.byte_start,
crop_byte_end: forward_token.byte_end,
}
}
@ -248,7 +246,7 @@ impl MatchBounds {
if let Some(crop_size) = format_options.crop.filter(|v| *v != 0) {
if matches.is_empty() {
return mbh.get_crop_bounds_with_no_matches(crop_size);
return Some(mbh.get_crop_bounds_with_no_matches(crop_size));
}
if format_options.highlight {
@ -258,15 +256,15 @@ impl MatchBounds {
return Some(mbh.get_crop_bounds_with_matches(crop_size));
}
if format_options.highlight && !matches.is_empty() {
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
} else {
None
if !format_options.highlight || matches.is_empty() {
return None;
}
Some(mbh.get_match_bounds(MatchesAndCropIndices {
matches_first_index: 0,
matches_last_index: matches.len() - 1,
crop_byte_start: 0,
crop_byte_end: tokens[tokens.len() - 1].byte_end,
}))
}
}

View file

@ -115,17 +115,21 @@ impl MatchingWords {
let position = [*positions.start(), *positions.end()];
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
if !matching_phrases.is_empty() {
located_matching_phrases.reserve(matching_phrases.len());
located_matching_phrases.extend(matching_phrases.iter().map(|matching_phrase| {
LocatedMatchingPhrase { value: *matching_phrase, position }
}));
}
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
if !matching_words.is_empty() {
located_matching_words.push(LocatedMatchingWords {
value: matching_words,
position,
is_prefix: term.is_prefix(),
original_char_count: term.original_word(&ctx).chars().count(),
});
}
}
// Sort words by having `is_prefix` as false first and then by their lengths in reverse order.
@ -147,12 +151,11 @@ impl MatchingWords {
token_position_helper_iter: &mut (impl Iterator<Item = TokenPositionHelper<'a>> + Clone),
) -> Option<(Match, UserQueryPositionRange)> {
let mut mapped_phrase_iter = self.located_matching_phrases.iter().map(|lmp| {
let words_iter = self
.phrase_interner
.get(lmp.value)
.words
let words = &self.phrase_interner.get(lmp.value).words;
let words_iter = words
.iter()
.map(|word_option| word_option.map(|word| self.word_interner.get(word).as_str()))
.map(|maybe_word| maybe_word.map(|word| self.word_interner.get(word).as_str()))
.peekable();
(lmp.position, words_iter)
@ -161,7 +164,7 @@ impl MatchingWords {
'outer: loop {
let (query_position_range, mut words_iter) = mapped_phrase_iter.next()?;
// TODO: Is it worth only cloning if we have to?
// TODO: if it's worth it, clone only if we have to
let mut tph_iter = token_position_helper_iter.clone();
let mut first_tph_details = None;
@ -241,46 +244,50 @@ impl MatchingWords {
tph: TokenPositionHelper,
text: &str,
) -> Option<(Match, UserQueryPositionRange)> {
let mut iter =
self.located_matching_words.iter().flat_map(|lw| lw.value.iter().map(move |w| (lw, w)));
// TODO: There is potentially an optimization to be made here
// if we matched a term then we can skip checking it for further iterations?
loop {
let (located_words, word) = iter.next()?;
let word = self.word_interner.get(*word);
self.located_matching_words
.iter()
.flat_map(|lw| lw.value.iter().map(move |w| (lw, w)))
.find_map(|(located_words, word)| {
let word = self.word_interner.get(*word);
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix) {
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
let [char_count, byte_len] =
match PrefixedOrEquality::new(tph.token.lemma(), word, located_words.is_prefix)
{
PrefixedOrEquality::Prefixed => {
let prefix_byte_len = text[tph.token.byte_start..]
.char_indices()
.nth(located_words.original_char_count - 1)
.map(|(i, c)| i + c.len_utf8())
.expect("expected text to have n-th thing bal bla TODO");
// TODO: Investigate token original byte length and similar methods and why they're not good enough
// TODO: Investigate token original byte length and similar methods and why they're not good enough
// That might be because token original byte length only or could also refer to the normalized byte length
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => continue,
};
[located_words.original_char_count, prefix_byte_len]
}
// do not +1, because Token index ranges are exclusive
PrefixedOrEquality::Equality => [
tph.token.char_end - tph.token.char_start,
tph.token.byte_end - tph.token.byte_start,
],
_ => return None,
};
return Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
Some((
Match {
char_count,
byte_len,
position: MatchPosition::Word {
word_position: tph.position_by_word,
token_position: tph.position_by_token,
},
},
},
located_words.position,
));
}
located_words.position,
))
})
}
pub fn get_matches_and_query_positions(
@ -361,93 +368,93 @@ impl Debug for MatchingWords {
}
}
#[cfg(test)]
pub(crate) mod tests {
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::search::new::matches::tests::temp_index_with_documents;
use crate::search::new::query_term::ExtractedTokens;
use charabia::{TokenKind, TokenizerBuilder};
use std::borrow::Cow;
// #[cfg(test)]
// pub(crate) mod tests {
// use super::super::super::located_query_terms_from_tokens;
// use super::*;
// use crate::search::new::matches::tests::temp_index_with_documents;
// use crate::search::new::query_term::ExtractedTokens;
// use charabia::{TokenKind, TokenizerBuilder};
// use std::borrow::Cow;
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let text = "split this world";
let tokens = tokenizer.tokenize(text);
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, &query_terms);
// #[test]
// fn matching_words() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let mut ctx = SearchContext::new(&temp_index, &rtxn).unwrap();
// let mut builder = TokenizerBuilder::default();
// let tokenizer = builder.build();
// let text = "split this world";
// let tokens = tokenizer.tokenize(text);
// let ExtractedTokens { query_terms, .. } =
// located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
// let matching_words = MatchingWords::new(ctx, &query_terms);
assert_eq!(
matching_words.get_matches_and_query_positions(
&[
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
},
Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
}
],
text
),
(
vec![
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 0, token_position: 0 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 2, token_position: 2 }
},
Match {
char_count: 5,
byte_len: 5,
position: MatchPosition::Word { word_position: 3, token_position: 3 }
}
],
vec![
QueryPosition { range: [0, 0], index: 0 },
QueryPosition { range: [2, 2], index: 1 },
QueryPosition { range: [2, 2], index: 2 }
]
)
);
}
}
// assert_eq!(
// matching_words.get_matches_and_query_positions(
// &[
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("split"),
// char_end: "split".chars().count(),
// byte_end: "split".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("nyc"),
// char_end: "nyc".chars().count(),
// byte_end: "nyc".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("world"),
// char_end: "world".chars().count(),
// byte_end: "world".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("worlded"),
// char_end: "worlded".chars().count(),
// byte_end: "worlded".len(),
// ..Default::default()
// },
// Token {
// kind: TokenKind::Word,
// lemma: Cow::Borrowed("thisnew"),
// char_end: "thisnew".chars().count(),
// byte_end: "thisnew".len(),
// ..Default::default()
// }
// ],
// text
// ),
// (
// vec![
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 0, token_position: 0 }
// },
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 2, token_position: 2 }
// },
// Match {
// char_count: 5,
// byte_len: 5,
// position: MatchPosition::Word { word_position: 3, token_position: 3 }
// }
// ],
// vec![
// QueryPosition { range: [0, 0], index: 0 },
// QueryPosition { range: [2, 2], index: 1 },
// QueryPosition { range: [2, 2], index: 2 }
// ]
// )
// );
// }
// }

View file

@ -48,10 +48,9 @@ impl<'a> MatcherBuilder<'a> {
}
}
#[derive(Copy, Clone, Default, Debug)]
#[derive(Copy, Clone, Default)]
pub struct FormatOptions {
pub highlight: bool,
// TODO: Should this be usize?
pub crop: Option<usize>,
}
@ -80,7 +79,9 @@ impl Matcher<'_, '_, '_, '_> {
/// TODO: description
pub fn get_match_bounds(
&mut self,
// TODO: Add option to count grapheme clusters instead of bytes
// TODO: Add option to count UTF-16 segments, or whatever JS works with when slicing strings
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String#utf-16_characters_unicode_code_points_and_grapheme_clusters
// https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/slice
format_options: Option<FormatOptions>,
) -> Option<MatchBounds> {
if self.text.is_empty() {
@ -152,7 +153,6 @@ mod tests {
use crate::index::tests::TempIndex;
use crate::{execute_search, filtered_universe, SearchContext, TimeBudget};
use charabia::TokenizerBuilder;
use memmap2::Mmap;
impl<'a> MatcherBuilder<'a> {
fn new_test(rtxn: &'a heed::RoTxn<'a>, index: &'a TempIndex, query: &str) -> Self {
@ -180,10 +180,9 @@ mod tests {
.unwrap();
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, &located_query_terms),
None => MatchingWords::default(),
};
let matching_words = located_query_terms
.map(|located_query_terms| MatchingWords::new(ctx, &located_query_terms))
.unwrap_or_default();
MatcherBuilder::new(
matching_words,
@ -197,283 +196,401 @@ mod tests {
}
}
pub fn temp_index_with_documents(documents: Option<Mmap>) -> TempIndex {
pub fn rename_me(
format_options: Option<FormatOptions>,
text: &str,
query: &str,
expected_text: &str,
) {
let temp_index = TempIndex::new();
// document will always contain the same exact text normally
// TODO: Describe this better and ask if this is actually the case
temp_index
.add_documents(documents.unwrap_or_else(|| {
documents!([
{ "id": 1, "name": "split this world westfali westfalia the Ŵôřlḑôle" },
{ "id": 2, "name": "Westfália" },
{ "id": 3, "name": "Ŵôřlḑôle" },
])
}))
.add_documents(documents!([
{ "id": 1, "text": text.to_string() },
]))
.unwrap();
temp_index
}
fn get_expected_maybe_text(expected_text: &str, text: &str) -> Option<String> {
if expected_text == text {
None
} else {
Some(expected_text.to_string())
}
}
#[test]
fn format_identity() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: false, crop: None });
let test_values = [
// Text without any match.
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// Text containing all matches.
"Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// Text containing some matches.
"Natalie risk her future to build a world with the boy she loves."
];
for text in test_values {
let mut matcher = builder.build(text, None);
// no crop and no highlight should return complete text.
assert_eq!(matcher.get_formatted_text(format_options), None);
}
}
#[test]
fn format_highlight() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: true, crop: None });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn highlight_unicode() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let format_options = Some(FormatOptions { highlight: true, crop: None });
let test_values = [
// Text containing prefix match.
["world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"],
// Text containing unicode match.
["world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"],
// Text containing unicode match.
["westfali", "Westfália", "<em>Westfáli</em>a"],
];
for [query, text, expected_text] in test_values {
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_crop() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right…"],
// Text without any match starting by a separator.
["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
"(A quick brown fox can not jump 32 feet, right…" ],
// Test phrase propagation
["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
"…Split The World is a book written by Emily Henry…"],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"…future to build a world with the boy she loves."],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"…she loves. Emily Henry: The Love That Split The World."],
// Text containing a match unordered and a match ordered.
["The world split void void void void void void void void void split the world void void",
"…void void void void void split the world void void"],
// Text containing matches with different density.
["split void the void void world void void void void void void void void void void split the world void void",
"…void void void void void split the world void void"],
["split split split split split split void void void void void void void void void void split the world void void",
"…void void void void void split the world void void"]
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_highlight_crop() {
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
let test_values = [
// empty text.
["", ""],
// text containing only separators.
[":-)", ":-)"],
// Text without any match.
["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
"A quick brown fox can not jump 32 feet, right…"],
// Text containing some matches.
["Natalie risk her future to build a world with the boy she loves.",
"…future to build a <em>world</em> with <em>the</em> boy she loves."],
// Text containing all matches.
["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
"…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// Text containing a match unordered and a match ordered.
["The world split void void void void void void void void void split the world void void",
"…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"]
];
for [text, expected_text] in test_values {
let mut matcher = builder.build(text, None);
// no crop should return complete text with highlighted matches.
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn format_highlight_crop_phrase_query() {
//! testing: https://github.com/meilisearch/meilisearch/issues/3975
let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
let temp_index = temp_index_with_documents(Some(documents!([
{ "id": 1, "text": text }
])));
let rtxn = temp_index.read_txn().unwrap();
let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
let test_values = [
// should return 10 words with a marker at the start as well the end, and the highlighted matches.
["\"the world\"",
"…the power to split <em>the world</em> between those who embraced…"],
// should highlight "those" and the phrase "and those".
["those \"and those\"",
"…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"],
["\"The groundbreaking invention had the power to split the world\"",
"<em>The groundbreaking invention had the power to split the world</em>…"],
["\"The groundbreaking invention had the power to split the world between those\"",
"<em>The groundbreaking invention had the power to split the world</em>…"],
["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
"…between those who <em>embraced progress and those who resisted change</em>!"],
["\"groundbreaking invention\" \"split the world between\"",
"…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"],
["\"groundbreaking invention\" \"had the power to split the world between those\"",
"…<em>invention</em> <em>had the power to split the world between those</em>…"],
];
for [query, expected_text] in test_values {
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
}
#[test]
fn smaller_crop_size() {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let temp_index = temp_index_with_documents(None);
let rtxn = temp_index.read_txn().unwrap();
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
let text = "void void split the world void void.";
let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
let mut matcher = builder.build(text, None);
let test_values = [
// set a smaller crop size
// because crop size < query size, partially format matches.
(2, "…split the…"),
// set a smaller crop size
// because crop size < query size, partially format matches.
(1, "…split…"),
// set crop size to 0
// because crop size is 0, crop is ignored.
(0, "void void split the world void void."),
];
for (crop_size, expected_text) in test_values {
// set a smaller crop size
let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
assert_eq!(
matcher.get_formatted_text(format_options),
get_expected_maybe_text(expected_text, text)
);
}
assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
}
#[test]
fn partial_matches() {
    // Mixes plain words with phrase queries whose words are split across
    // token boundaries in the text ("t he", "do or").
    let temp_index = temp_index_with_documents(None);
    let rtxn = temp_index.read_txn().unwrap();
    let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    let text = "the do or die can't be he do and or isn't he";
    let mut matcher = builder.build(text, None);
    assert_eq!(
        matcher.get_formatted_text(format_options),
        Some(
            "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
                .to_string()
        )
    );
}
/// "Dei store fiskane eta dei små — dei liger under som minst förmå."
///
/// (Men are like fish; the great ones devour the small.)
fn rename_me_with_base_text(
    format_options: Option<FormatOptions>,
    query: &str,
    expected_text: &str,
) {
    // Shared haystack used by the phrase-highlight crop tests below.
    const BASE_TEXT: &str = "Dei store fiskane eta dei små — dei liger under som minst förmå.";
    rename_me(format_options, BASE_TEXT, query, expected_text);
}
#[test]
fn phrase_highlight_bigger_than_crop() {
    // The crop window (1 word) is narrower than the two-word phrase, so only
    // the first word of the phrase survives the crop — still highlighted.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(1) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei</em>…");
}
#[test]
fn phrase_highlight_same_size_as_crop() {
    // The crop window exactly fits the two-word phrase: the whole phrase is
    // kept and highlighted, with ellipses on both sides.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(2) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…<em>dei liger</em>…");
}
#[test]
fn phrase_highlight_crop_middle() {
    // The phrase sits in the middle of the text, so the 4-word crop window
    // adds context on both sides of the highlighted phrase.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"dei liger\"", "…små — <em>dei liger</em> under…");
}
#[test]
fn phrase_highlight_crop_end() {
    // The phrase ends the text: the crop window extends backwards only, and
    // no trailing ellipsis is added.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"minst förmå\"", "…under som <em>minst förmå</em>.");
}
#[test]
fn phrase_highlight_crop_beginning() {
    // The phrase opens the text: the crop window extends forwards only, and
    // no leading ellipsis is added.
    let format_options = Some(FormatOptions { highlight: true, crop: Some(4) });
    rename_me_with_base_text(format_options, "\"Dei store\"", "<em>Dei store</em> fiskane eta…");
}
#[test]
fn highlight_end() {
    // Non-phrase query matching the final words; no crop, so the whole text
    // is returned with each matched word highlighted separately.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "minst förmå",
        "Dei store fiskane eta dei små — dei liger under som <em>minst</em> <em>förmå</em>.",
    );
}
#[test]
fn highlight_beginning_and_middle() {
    // "Dei"/"dei" occurrences are matched wherever they appear (case-insensitive),
    // not only at the beginning of the text.
    let format_options = Some(FormatOptions { highlight: true, crop: None });
    rename_me_with_base_text(
        format_options,
        "Dei store",
        "<em>Dei</em> <em>store</em> fiskane eta <em>dei</em> små — <em>dei</em> liger under som minst förmå.",
    );
}
#[test]
fn partial_match_middle() {
    // TODO: Is this intentional?
    // When the only interned word is "forma", the searched prefix word "fo"
    // is not found inside "forma" by
    // milli::search::new::matches::matching_words::MatchingWords::try_get_word_match;
    // milli::search::new::query_term::QueryTerm::all_computed_derivations
    // might be at fault here.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    // interned words = ["forma"]
    rename_me(highlight_only(), "altså, förmå, på en måte", "fo", "altså, <em>förmå</em>, på en måte");
    // interned words = ["fo", "forma"]
    rename_me(
        highlight_only(),
        "altså, fo förmå, på en måte",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
#[test]
fn partial_match_end() {
    // Same prefix-match scenario as partial_match_middle, with the candidate
    // word at the start of the text and the match running to the end.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    rename_me(highlight_only(), "förmå, på en måte", "fo", "<em>förmå</em>, på en måte");
    rename_me(
        highlight_only(),
        "fo förmå, på en måte",
        "fo",
        "<em>fo</em> <em>fö</em>rmå, på en måte",
    );
}
#[test]
fn partial_match_beginning() {
    // Same prefix-match scenario as partial_match_middle, with the candidate
    // word at the very end of the text.
    let highlight_only = || Some(FormatOptions { highlight: true, crop: None });
    rename_me(highlight_only(), "altså, förmå", "fo", "altså, <em>förmå</em>");
    rename_me(
        highlight_only(),
        "altså, fo förmå",
        "fo",
        "altså, <em>fo</em> <em>fö</em>rmå",
    );
}
// #[test]
// fn format_identity() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: false, crop: None });
// let test_values = [
// // Text without any match.
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// // Text containing all matches.
// "Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// // Text containing some matches.
// "Natalie risk her future to build a world with the boy she loves."
// ];
// for text in test_values {
// let mut matcher = builder.build(text, None);
// // no crop and no highlight should return complete text.
// assert_eq!(matcher.get_formatted_text(format_options), None);
// }
// }
// #[test]
// fn format_highlight() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let test_values = [
// // empty text.
// ["", ""],
// // text containing only separators.
// [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right? Brr, it is cold!"],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "Natalie risk her future to build a <em>world</em> with <em>the</em> boy she loves."],
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn highlight_unicode() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let test_values = [
// // Text containing prefix match.
// ["world", "Ŵôřlḑôle", "<em>Ŵôřlḑ</em>ôle"],
// // Text containing unicode match.
// ["world", "Ŵôřlḑ", "<em>Ŵôřlḑ</em>"],
// // Text containing unicode match.
// ["westfali", "Westfália", "<em>Westfáli</em>a"],
// ];
// for [query, text, expected_text] in test_values {
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_crop() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: false, crop: Some(10) });
// let test_values = [
// // empty text.
// // ["", ""],
// // text containing only separators.
// // [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right…"],
// // Text without any match starting by a separator.
// ["(A quick brown fox can not jump 32 feet, right? Brr, it is cold!)",
// "(A quick brown fox can not jump 32 feet, right…" ],
// // Test phrase propagation
// ["Natalie risk her future. Split The World is a book written by Emily Henry. I never read it.",
// "…Split The World is a book written by Emily Henry…"],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "…future to build a world with the boy she loves."],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "…she loves. Emily Henry: The Love That Split The World."],
// // Text containing a match unordered and a match ordered.
// ["The world split void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"],
// // Text containing matches with different density.
// ["split void the void void world void void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"],
// ["split split split split split split void void void void void void void void void void split the world void void",
// "…void void void void void split the world void void"]
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_highlight_crop() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
// let test_values = [
// // empty text.
// ["", ""],
// // text containing only separators.
// [":-)", ":-)"],
// // Text without any match.
// ["A quick brown fox can not jump 32 feet, right? Brr, it is cold!",
// "A quick brown fox can not jump 32 feet, right…"],
// // Text containing some matches.
// ["Natalie risk her future to build a world with the boy she loves.",
// "…future to build a <em>world</em> with <em>the</em> boy she loves."],
// // Text containing all matches.
// ["Natalie risk her future to build a world with the boy she loves. Emily Henry: The Love That Split The World.",
// "…she loves. Emily Henry: <em>The</em> Love That <em>Split</em> <em>The</em> <em>World</em>."],
// // Text containing a match unordered and a match ordered.
// ["The world split void void void void void void void void void split the world void void",
// "…void void void void void <em>split</em> <em>the</em> <em>world</em> void void"]
// ];
// for [text, expected_text] in test_values {
// let mut matcher = builder.build(text, None);
// // no crop should return complete text with highlighted matches.
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn format_highlight_crop_phrase_query() {
// //! testing: https://github.com/meilisearch/meilisearch/issues/3975
// let text = "The groundbreaking invention had the power to split the world between those who embraced progress and those who resisted change!";
// let temp_index = temp_index_with_documents(Some(documents!([
// { "id": 1, "text": text }
// ])));
// let rtxn = temp_index.read_txn().unwrap();
// let format_options = Some(FormatOptions { highlight: true, crop: Some(10) });
// let test_values = [
// // should return 10 words with a marker at the start as well the end, and the highlighted matches.
// ["\"the world\"",
// "…the power to split <em>the world</em> between those who embraced…"],
// // should highlight "those" and the phrase "and those".
// ["those \"and those\"",
// "…world between <em>those</em> who embraced progress <em>and those</em> who resisted…"],
// ["\"The groundbreaking invention had the power to split the world\"",
// "<em>The groundbreaking invention had the power to split the world</em>…"],
// ["\"The groundbreaking invention had the power to split the world between those\"",
// "<em>The groundbreaking invention had the power to split the world</em>…"],
// ["\"The groundbreaking invention\" \"embraced progress and those who resisted change!\"",
// "…between those who <em>embraced progress and those who resisted change</em>!"],
// ["\"groundbreaking invention\" \"split the world between\"",
// "…<em>groundbreaking invention</em> had the power to <em>split the world between</em>…"],
// ["\"groundbreaking invention\" \"had the power to split the world between those\"",
// "…<em>invention</em> <em>had the power to split the world between those</em>…"],
// ];
// for [query, expected_text] in test_values {
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, query);
// let mut matcher = builder.build(text, None);
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn smaller_crop_size() {
// //! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "split the world");
// let text = "void void split the world void void.";
// let mut matcher = builder.build(text, None);
// let test_values = [
// // set a smaller crop size
// // because crop size < query size, partially format matches.
// (2, "…split the…"),
// // set a smaller crop size
// // because crop size < query size, partially format matches.
// (1, "…split…"),
// // set crop size to 0
// // because crop size is 0, crop is ignored.
// (0, "void void split the world void void."),
// ];
// for (crop_size, expected_text) in test_values {
// // set a smaller crop size
// let format_options = Some(FormatOptions { highlight: false, crop: Some(crop_size) });
// assert_eq!(matcher.get_formatted_text(format_options), Some(expected_text.to_string()));
// }
// }
// #[test]
// fn partial_matches() {
// let temp_index = temp_index_with_documents(None);
// let rtxn = temp_index.read_txn().unwrap();
// let builder = MatcherBuilder::new_test(&rtxn, &temp_index, "the \"t he\" door \"do or\"");
// let format_options = Some(FormatOptions { highlight: true, crop: None });
// let text = "the do or die can't be he do and or isn't he";
// let mut matcher = builder.build(text, None);
// assert_eq!(
// matcher.get_formatted_text(format_options),
// Some(
// "<em>the</em> <em>do or</em> die can't be he do and or isn'<em>t he</em>"
// .to_string()
// )
// );
// }
}

View file

@ -489,8 +489,7 @@ impl QueryTerm {
let mut words = BTreeSet::new();
let mut phrases = BTreeSet::new();
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
&self.zero_typo;
let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, .. } = &self.zero_typo;
words.extend(zero_typo.iter().copied());
words.extend(prefix_of.iter().copied());
phrases.extend(phrase.iter().copied());