Merge branch 'search-refactor-highlighter' into search-refactor-highlighter-merged

Loïc Lecrenier 2023-04-11 12:22:34 +02:00
commit e7bb8c940f
8 changed files with 470 additions and 631 deletions

View file

@@ -98,8 +98,8 @@ pub use self::heed_codec::{
};
pub use self::index::Index;
pub use self::search::{
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
};
pub type Result<T> = std::result::Result<T, error::Error>;

View file

@@ -1,458 +0,0 @@
use std::cmp::{min, Reverse};
use std::collections::BTreeMap;
use std::fmt;
use std::ops::{Index, IndexMut};
use std::rc::Rc;
use charabia::Token;
use levenshtein_automata::{Distance, DFA};
use crate::error::InternalError;
use crate::search::build_dfa;
use crate::MAX_WORD_LENGTH;
type IsPrefix = bool;
/// Structure created from a query tree
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
inner: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
}
impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
writeln!(f, "[")?;
for (matching_words, primitive_word_id) in self.inner.iter() {
writeln!(f, "({matching_words:?}, {primitive_word_id:?})")?;
}
writeln!(f, "]")?;
Ok(())
}
}
impl MatchingWords {
pub fn new(
mut matching_words: Vec<(Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)>,
) -> crate::Result<Self> {
// if one of the matching_words vecs doesn't contain any word.
if matching_words.iter().any(|(mw, _)| mw.is_empty()) {
return Err(InternalError::InvalidMatchingWords.into());
}
// Sort words by length in descending order, prioritizing the longest matches,
// in order to highlight the longest part of the matched word.
matching_words.sort_unstable_by_key(|(mw, _)| Reverse((mw.len(), mw[0].word.len())));
Ok(Self { inner: matching_words })
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { inner: Box::new(self.inner.iter()), token }
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
#[allow(clippy::type_complexity)]
inner: Box<dyn Iterator<Item = &'a (Vec<Rc<MatchingWord>>, Vec<PrimitiveWordId>)> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.inner.next() {
Some((matching_words, ids)) => match matching_words[0].match_token(self.token) {
Some(char_len) => {
if matching_words.len() > 1 {
Some(MatchType::Partial(PartialMatch {
matching_words: &matching_words[1..],
ids,
char_len,
}))
} else {
Some(MatchType::Full { char_len, ids })
}
}
None => self.next(),
},
None => None,
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type PrimitiveWordId = u8;
/// Structure used to match a specific term.
pub struct MatchingWord {
pub dfa: DFA,
pub word: String,
pub typo: u8,
pub prefix: IsPrefix,
}
impl fmt::Debug for MatchingWord {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.debug_struct("MatchingWord")
.field("word", &self.word)
.field("typo", &self.typo)
.field("prefix", &self.prefix)
.finish()
}
}
impl PartialEq for MatchingWord {
fn eq(&self, other: &Self) -> bool {
self.prefix == other.prefix && self.typo == other.typo && self.word == other.word
}
}
impl MatchingWord {
pub fn new(word: String, typo: u8, prefix: IsPrefix) -> Option<Self> {
if word.len() > MAX_WORD_LENGTH {
return None;
}
let dfa = build_dfa(&word, typo, prefix);
Some(Self { dfa, word, typo, prefix })
}
/// Returns the length in chars of the match if the token matches the term.
pub fn match_token(&self, token: &Token) -> Option<usize> {
match self.dfa.eval(token.lemma()) {
Distance::Exact(t) if t <= self.typo => {
if self.prefix {
let len = bytes_to_highlight(token.lemma(), &self.word);
Some(token.original_lengths(len).0)
} else {
Some(token.original_lengths(token.lemma().len()).0)
}
}
_otherwise => None,
}
}
}
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases, we need to match several consecutive tokens before considering the match full.
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a [PrimitiveWordId] },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: &'a [Rc<MatchingWord>],
ids: &'a [PrimitiveWordId],
char_len: usize,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
self.matching_words[0].match_token(token).map(|char_len| {
if self.matching_words.len() > 1 {
MatchType::Partial(PartialMatch {
matching_words: &self.matching_words[1..],
ids: self.ids,
char_len,
})
} else {
MatchType::Full { char_len, ids: self.ids }
}
})
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
// A simple wrapper around a Vec so the data stays contiguous while being indexable like a 2D array.
struct N2Array<T> {
y_size: usize,
buf: Vec<T>,
}
impl<T: Clone> N2Array<T> {
fn new(x: usize, y: usize, value: T) -> N2Array<T> {
N2Array { y_size: y, buf: vec![value; x * y] }
}
}
impl<T> Index<(usize, usize)> for N2Array<T> {
type Output = T;
#[inline]
fn index(&self, (x, y): (usize, usize)) -> &T {
&self.buf[(x * self.y_size) + y]
}
}
impl<T> IndexMut<(usize, usize)> for N2Array<T> {
#[inline]
fn index_mut(&mut self, (x, y): (usize, usize)) -> &mut T {
&mut self.buf[(x * self.y_size) + y]
}
}
/// Returns the number of **bytes** we want to highlight in the `source` word.
/// Basically, we want to highlight as many characters as possible in the source
/// until it accumulates too many typos (= 2).
/// The algorithm is a modified
/// [Damerau-Levenshtein](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
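/// For example (an illustrative case mirroring the tests below): with
/// `source = "Levenshtein"` and `target = "Levenste"`, the function returns
/// `"Levenste".len()`, i.e. the first 8 bytes of the source are highlighted,
/// since the end of the target is reached with a single typo.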
fn bytes_to_highlight(source: &str, target: &str) -> usize {
let n = source.chars().count();
let m = target.chars().count();
if n == 0 {
return 0;
}
// since we allow two typos, we can highlight up to two characters even if they're completely wrong
if m < 3 {
return source.chars().take(m).map(|c| c.len_utf8()).sum();
}
if n == m && source == target {
return source.len();
}
let inf = n + m;
let mut matrix = N2Array::new(n + 2, m + 2, 0);
matrix[(0, 0)] = inf;
for i in 0..=n {
matrix[(i + 1, 0)] = inf;
matrix[(i + 1, 1)] = i;
}
for j in 0..=m {
matrix[(0, j + 1)] = inf;
matrix[(1, j + 1)] = j;
}
let mut last_row = BTreeMap::new();
for (row, char_s) in source.chars().enumerate() {
let mut last_match_col = 0;
let row = row + 1;
for (col, char_t) in target.chars().enumerate() {
let col = col + 1;
let last_match_row = *last_row.get(&char_t).unwrap_or(&0);
let cost = usize::from(char_s != char_t);
let dist_add = matrix[(row, col + 1)] + 1;
let dist_del = matrix[(row + 1, col)] + 1;
let dist_sub = matrix[(row, col)] + cost;
let dist_trans = matrix[(last_match_row, last_match_col)]
+ (row - last_match_row - 1)
+ 1
+ (col - last_match_col - 1);
let dist = min(min(dist_add, dist_del), min(dist_sub, dist_trans));
matrix[(row + 1, col + 1)] = dist;
if cost == 0 {
last_match_col = col;
}
}
last_row.insert(char_s, row);
}
let mut minimum = (u32::max_value(), 0);
for x in 0..=m {
let dist = matrix[(n + 1, x + 1)] as u32;
if dist < minimum.0 {
minimum = (dist, x);
}
}
// everything was done character-wise and now we want to return a number of bytes
source.chars().take(minimum.1).map(|c| c.len_utf8()).sum()
}
#[cfg(test)]
mod tests {
use std::borrow::Cow;
use std::str::from_utf8;
use charabia::TokenKind;
use super::*;
use crate::MatchingWords;
#[test]
fn test_bytes_to_highlight() {
struct TestBytesToHighlight {
query: &'static str,
text: &'static str,
length: usize,
}
let tests = [
TestBytesToHighlight { query: "bip", text: "bip", length: "bip".len() },
TestBytesToHighlight { query: "bip", text: "boup", length: "bip".len() },
TestBytesToHighlight {
query: "Levenshtein",
text: "Levenshtein",
length: "Levenshtein".len(),
},
// we get to the end of our word with only one typo
TestBytesToHighlight {
query: "Levenste",
text: "Levenshtein",
length: "Levenste".len(),
},
// we get our third and last authorized typo right on the last character
TestBytesToHighlight {
query: "Levenstein",
text: "Levenshte",
length: "Levenste".len(),
},
// we get to the end of our word with only two typos at the beginning
TestBytesToHighlight {
query: "Bavenshtein",
text: "Levenshtein",
length: "Bavenshtein".len(),
},
TestBytesToHighlight {
query: "Альфа", text: "Альфой", length: "Альф".len()
},
TestBytesToHighlight {
query: "Go💼", text: "Go💼od luck.", length: "Go💼".len()
},
TestBytesToHighlight {
query: "Go💼od", text: "Go💼od luck.", length: "Go💼od".len()
},
TestBytesToHighlight {
query: "chäräcters",
text: "chäräcters",
length: "chäräcters".len(),
},
TestBytesToHighlight { query: "ch", text: "chäräcters", length: "ch".len() },
TestBytesToHighlight { query: "chär", text: "chäräcters", length: "chär".len() },
];
for test in &tests {
let length = bytes_to_highlight(test.text, test.query);
assert_eq!(length, test.length, r#"length between: "{}" "{}""#, test.query, test.text);
assert!(
from_utf8(&test.query.as_bytes()[..length]).is_ok(),
r#"converting {}[..{}] to an utf8 str failed"#,
test.query,
length
);
}
}
#[test]
fn matching_words() {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 1, true).unwrap()),
Rc::new(MatchingWord::new("this".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone()], vec![1]),
(vec![all[2].clone()], vec![2]),
];
let matching_words = MatchingWords::new(matching_words).unwrap();
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("word"),
char_end: "word".chars().count(),
byte_end: "word".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 3, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("splitted"),
char_end: "splitted".chars().count(),
byte_end: "splitted".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[0] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("borld"),
char_end: "borld".chars().count(),
byte_end: "borld".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &[2] })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("wordsplit"),
char_end: "wordsplit".chars().count(),
byte_end: "wordsplit".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 4, ids: &[2] })
);
}
}

View file

@@ -5,9 +5,8 @@ use once_cell::sync::Lazy;
use roaring::bitmap::RoaringBitmap;
pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
pub use self::matches::{
FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
};
pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
use self::new::PartialSearchResult;
use crate::{
execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
};
@@ -19,7 +18,6 @@ static LEVDIST2: Lazy<LevBuilder> = Lazy::new(|| LevBuilder::new(2, true));
pub mod facet;
mod fst_utils;
mod matches;
pub mod new;
pub struct Search<'a> {
@@ -110,19 +108,28 @@ impl<'a> Search<'a> {
pub fn execute(&self) -> Result<SearchResult> {
let mut ctx = SearchContext::new(self.index, self.rtxn);
execute_search(
&mut ctx,
&self.query,
self.terms_matching_strategy,
self.exhaustive_number_hits,
&self.filter,
&self.sort_criteria,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
)
let PartialSearchResult { located_query_terms, candidates, documents_ids } =
execute_search(
&mut ctx,
&self.query,
self.terms_matching_strategy,
self.exhaustive_number_hits,
&self.filter,
&self.sort_criteria,
self.offset,
self.limit,
Some(self.words_limit),
&mut DefaultSearchLogger,
&mut DefaultSearchLogger,
)?;
// consume context and located_query_terms to build MatchingWords.
let matching_words = match located_query_terms {
Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
None => MatchingWords::default(),
};
Ok(SearchResult { matching_words, candidates, documents_ids })
}
}

View file

@@ -0,0 +1,377 @@
use std::cmp::Reverse;
use std::fmt;
use std::ops::RangeInclusive;
use charabia::Token;
use super::super::interner::Interned;
use super::super::query_term::{
Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
};
use super::super::{DedupInterner, Phrase};
use crate::SearchContext;
pub struct LocatedMatchingPhrase {
pub value: Interned<Phrase>,
pub positions: RangeInclusive<WordId>,
}
pub struct LocatedMatchingWords {
pub value: Vec<Interned<String>>,
pub positions: RangeInclusive<WordId>,
pub is_prefix: bool,
pub original_char_count: usize,
}
/// Structure created from a query tree
/// referencing words that match the given query tree.
#[derive(Default)]
pub struct MatchingWords {
word_interner: DedupInterner<String>,
phrase_interner: DedupInterner<Phrase>,
phrases: Vec<LocatedMatchingPhrase>,
words: Vec<LocatedMatchingWords>,
}
/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
let mut matching_words = Vec::new();
let mut matching_phrases = Vec::new();
// the structure is exhaustively extracted to ensure that no field is missing.
let QueryTerm {
original: _,
is_multiple_words: _,
max_nbr_typos: _,
is_prefix: _,
zero_typo,
one_typo,
two_typo,
} = term;
// the structure is exhaustively extracted to ensure that no field is missing.
let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
// zero typo
if let Some(phrase) = phrase {
matching_phrases.push(*phrase);
}
if let Some(zero_typo) = zero_typo {
matching_words.push(*zero_typo);
}
for synonym in synonyms {
matching_phrases.push(*synonym);
}
// one typo
// the structure is exhaustively extracted to ensure that no field is missing.
if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
if let Some(split_words) = split_words {
matching_phrases.push(*split_words);
}
for one_typo in one_typo {
matching_words.push(*one_typo);
}
}
// two typos
// the structure is exhaustively extracted to ensure that no field is missing.
if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
for two_typos in two_typos {
matching_words.push(*two_typos);
}
}
(matching_phrases, matching_words)
}
impl MatchingWords {
pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
let mut phrases = Vec::new();
let mut words = Vec::new();
// Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
// and wrap them in dedicated structures.
for located_term in located_terms {
let term = ctx.term_interner.get(located_term.value);
let (matching_phrases, matching_words) = extract_matching_terms(term);
for matching_phrase in matching_phrases {
phrases.push(LocatedMatchingPhrase {
value: matching_phrase,
positions: located_term.positions.clone(),
});
}
words.push(LocatedMatchingWords {
value: matching_words,
positions: located_term.positions.clone(),
is_prefix: term.is_prefix,
original_char_count: ctx.word_interner.get(term.original).chars().count(),
});
}
// Sort words to put prefixes at the bottom, prioritizing the exact matches.
words.sort_unstable_by_key(|lmw| (lmw.is_prefix, Reverse(lmw.positions.len())));
Self {
phrases,
words,
word_interner: ctx.word_interner,
phrase_interner: ctx.phrase_interner,
}
}
/// Returns an iterator over terms that match or partially match the given token.
pub fn match_token<'a, 'b>(&'a self, token: &'b Token<'b>) -> MatchesIter<'a, 'b> {
MatchesIter { matching_words: self, phrases: Box::new(self.phrases.iter()), token }
}
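// Illustrative usage (a sketch, not part of the original code): a highlighter
// takes the first MatchType yielded for a token and, when it is Partial, feeds
// the following tokens to `PartialMatch::match_token` until the match becomes
// Full or is broken.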
/// Try to match the token with one of the located_words.
fn match_unique_words<'a>(&'a self, token: &Token) -> Option<MatchType<'a>> {
for located_words in &self.words {
for word in &located_words.value {
let word = self.word_interner.get(*word);
// if the word is a prefix we match using starts_with.
if located_words.is_prefix && token.lemma().starts_with(word) {
let Some((char_index, c)) = word.char_indices().take(located_words.original_char_count).last() else {
continue;
};
let prefix_length = char_index + c.len_utf8();
let char_len = token.original_lengths(prefix_length).0;
let ids = &located_words.positions;
return Some(MatchType::Full { char_len, ids });
// else we exact match the token.
} else if token.lemma() == word {
let char_len = token.char_end - token.char_start;
let ids = &located_words.positions;
return Some(MatchType::Full { char_len, ids });
}
}
}
None
}
}
/// Iterator over terms that match the given token.
/// This allows matches to be evaluated lazily.
pub struct MatchesIter<'a, 'b> {
matching_words: &'a MatchingWords,
phrases: Box<dyn Iterator<Item = &'a LocatedMatchingPhrase> + 'a>,
token: &'b Token<'b>,
}
impl<'a> Iterator for MatchesIter<'a, '_> {
type Item = MatchType<'a>;
fn next(&mut self) -> Option<Self::Item> {
match self.phrases.next() {
// Try to match all the phrases first.
Some(located_phrase) => {
let phrase = self.matching_words.phrase_interner.get(located_phrase.value);
// create a PartialMatch struct to make it compute the first match
// instead of duplicating the code.
let ids = &located_phrase.positions;
// collect the references of words from the interner.
let words = phrase
.words
.iter()
.map(|word| {
word.map(|word| self.matching_words.word_interner.get(word).as_str())
})
.collect();
let partial = PartialMatch { matching_words: words, ids, char_len: 0 };
partial.match_token(self.token).or_else(|| self.next())
}
// If no phrase matches, try to match unique words.
None => self.matching_words.match_unique_words(self.token),
}
}
}
/// Id of a matching term corresponding to a word written by the end user.
pub type WordId = u16;
/// A given token can partially match a query word for several reasons:
/// - split words
/// - multi-word synonyms
/// In these cases, we need to match several consecutive tokens before considering the match full.
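/// For example, given a hypothetical multi-word synonym "new york", the token
/// "new" alone only yields a `Partial` match; the next token must match "york"
/// for the match to become `Full`.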
#[derive(Debug, PartialEq)]
pub enum MatchType<'a> {
Full { char_len: usize, ids: &'a RangeInclusive<WordId> },
Partial(PartialMatch<'a>),
}
/// Structure helper to match several tokens in a row in order to complete a partial match.
#[derive(Debug, PartialEq)]
pub struct PartialMatch<'a> {
matching_words: Vec<Option<&'a str>>,
ids: &'a RangeInclusive<WordId>,
char_len: usize,
}
impl<'a> PartialMatch<'a> {
/// Returns:
/// - None if the given token breaks the partial match
/// - Partial if the given token matches the partial match but doesn't complete it
/// - Full if the given token completes the partial match
pub fn match_token(self, token: &Token) -> Option<MatchType<'a>> {
let Self { mut matching_words, ids, .. } = self;
let is_matching = match matching_words.first()? {
Some(word) => &token.lemma() == word,
// a None value in the phrase corresponds to a stop word,
// the value is considered a match if the current token is categorized as a stop word.
None => token.is_stopword(),
};
let char_len = token.char_end - token.char_start;
// if there are remaining words to match in the phrase and the current token is matching,
// return a new Partial match allowing the highlighter to continue.
if is_matching && matching_words.len() > 1 {
matching_words.remove(0);
Some(MatchType::Partial(PartialMatch { matching_words, ids, char_len }))
// if there is no remaining word to match in the phrase and the current token is matching,
// return a Full match.
} else if is_matching {
Some(MatchType::Full { char_len, ids })
// if the current token doesn't match, return None to break the match sequence.
} else {
None
}
}
pub fn char_len(&self) -> usize {
self.char_len
}
}
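// A minimal sketch of how a phrase is walked (illustrative, not part of the
// original code), assuming `partial` wraps the phrase words ["split", "world"]:
// match partial.match_token(&split_token) {
//     // "split" matched but "world" remains: feed the next token.
//     Some(MatchType::Partial(partial)) => partial.match_token(&world_token),
//     other => other, // Full, or None if the sequence is broken.
// }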
impl fmt::Debug for MatchingWords {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
let phrases: Vec<_> = phrases
.iter()
.map(|p| {
(
phrase_interner
.get(p.value)
.words
.iter()
.map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
.collect::<Vec<_>>()
.join(" "),
p.positions.clone(),
)
})
.collect();
let words: Vec<_> = words
.iter()
.flat_map(|w| {
w.value
.iter()
.map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
.collect::<Vec<_>>()
})
.collect();
f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
}
}
#[cfg(test)]
pub(crate) mod tests {
use std::borrow::Cow;
use charabia::{TokenKind, TokenizerBuilder};
use super::super::super::located_query_terms_from_string;
use super::*;
use crate::index::tests::TempIndex;
pub(crate) fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
temp_index
.add_documents(documents!([
{ "id": 1, "name": "split this world westfali westfalia the" },
]))
.unwrap();
temp_index
}
#[test]
fn matching_words() {
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let mut ctx = SearchContext::new(&temp_index, &rtxn);
let tokenizer = TokenizerBuilder::new().build();
let tokens = tokenizer.tokenize("split this world");
let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("split"),
char_end: "split".chars().count(),
byte_end: "split".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(0..=0) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("nyc"),
char_end: "nyc".chars().count(),
byte_end: "nyc".len(),
..Default::default()
})
.next(),
None
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("world"),
char_end: "world".chars().count(),
byte_end: "world".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("worlded"),
char_end: "worlded".chars().count(),
byte_end: "worlded".len(),
..Default::default()
})
.next(),
Some(MatchType::Full { char_len: 5, ids: &(2..=2) })
);
assert_eq!(
matching_words
.match_token(&Token {
kind: TokenKind::Word,
lemma: Cow::Borrowed("thisnew"),
char_end: "thisnew".chars().count(),
byte_end: "thisnew".len(),
..Default::default()
})
.next(),
None
);
}
}

View file

@@ -1,8 +1,8 @@
use std::borrow::Cow;
use charabia::{SeparatorKind, Token, Tokenizer};
use matching_words::{MatchType, PartialMatch, PrimitiveWordId};
pub use matching_words::{MatchingWord, MatchingWords};
pub use matching_words::MatchingWords;
use matching_words::{MatchType, PartialMatch, WordId};
use serde::Serialize;
pub mod matching_words;
@@ -88,7 +88,7 @@ impl FormatOptions {
pub struct Match {
match_len: usize,
// ids of the query words that match.
ids: Vec<PrimitiveWordId>,
ids: Vec<WordId>,
// position of the word in the whole text.
word_position: usize,
// position of the token in the whole text.
@@ -137,11 +137,12 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
}
// partial match is now full, we keep these matches and we advance positions
Some(MatchType::Full { char_len, ids }) => {
let ids: Vec<_> = ids.clone().into_iter().collect();
// save previously matched tokens as matches.
let iter = potential_matches.into_iter().map(
|(token_position, word_position, match_len)| Match {
match_len,
ids: ids.to_vec(),
ids: ids.clone(),
word_position,
token_position,
},
@@ -151,7 +152,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
// save the token that closes the partial match as a match.
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
ids,
word_position,
token_position,
});
@@ -191,9 +192,10 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
// we match, we save the current token as a match,
// then we continue with the rest of the tokens.
MatchType::Full { char_len, ids } => {
let ids: Vec<_> = ids.clone().into_iter().collect();
matches.push(Match {
match_len: char_len,
ids: ids.to_vec(),
ids,
word_position,
token_position,
});
@@ -334,7 +336,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
/// 2) calculate distance between matches
/// 3) count ordered matches
fn match_interval_score(&self, matches: &[Match]) -> (i16, i16, i16) {
let mut ids: Vec<PrimitiveWordId> = Vec::with_capacity(matches.len());
let mut ids: Vec<WordId> = Vec::with_capacity(matches.len());
let mut order_score = 0;
let mut distance_score = 0;
@@ -494,39 +496,29 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
#[cfg(test)]
mod tests {
use std::rc::Rc;
use charabia::TokenizerBuilder;
use matching_words::tests::temp_index_with_documents;
use super::super::located_query_terms_from_string;
use super::*;
use crate::search::matches::matching_words::MatchingWord;
use crate::SearchContext;
fn matching_words() -> MatchingWords {
let all = vec![
Rc::new(MatchingWord::new("split".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone()], vec![1]),
(vec![all[2].clone()], vec![2]),
];
MatchingWords::new(matching_words).unwrap()
}
impl MatcherBuilder<'_, Vec<u8>> {
pub fn from_matching_words(matching_words: MatchingWords) -> Self {
Self::new(matching_words, TokenizerBuilder::default().build())
impl<'a> MatcherBuilder<'a, &[u8]> {
pub fn new_test(mut ctx: SearchContext, query: &'a str) -> Self {
let tokenizer = TokenizerBuilder::new().build();
let tokens = tokenizer.tokenize(query);
let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
Self::new(matching_words, TokenizerBuilder::new().build())
}
}
#[test]
fn format_identity() {
let matching_words = matching_words();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: false, crop: None };
@@ -551,9 +543,10 @@ mod tests {
#[test]
fn format_highlight() {
let matching_words = matching_words();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: true, crop: None };
@@ -594,16 +587,10 @@
#[test]
fn highlight_unicode() {
let all = vec![
Rc::new(MatchingWord::new("wessfali".to_string(), 1, true).unwrap()),
Rc::new(MatchingWord::new("world".to_string(), 1, true).unwrap()),
];
let matching_words = vec![(vec![all[0].clone()], vec![0]), (vec![all[1].clone()], vec![1])];
let matching_words = MatchingWords::new(matching_words).unwrap();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "world");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing prefix match.
@@ -624,6 +611,10 @@
@"<em>Ŵôřlḑ</em>"
);
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "westfali");
let format_options = FormatOptions { highlight: true, crop: None };
// Text containing unicode match.
let text = "Westfália";
let mut matcher = builder.build(text);
@@ -636,9 +627,10 @@
#[test]
fn format_crop() {
let matching_words = matching_words();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: false, crop: Some(10) };
@@ -733,9 +725,10 @@
#[test]
fn format_highlight_crop() {
let matching_words = matching_words();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "split the world");
let format_options = FormatOptions { highlight: true, crop: Some(10) };
@@ -795,9 +788,10 @@
#[test]
fn smaller_crop_size() {
//! testing: https://github.com/meilisearch/specifications/pull/120#discussion_r836536295
let matching_words = matching_words();
let builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let builder = MatcherBuilder::new_test(ctx, "split the world");
let text = "void void split the world void void.";
@@ -831,25 +825,10 @@
#[test]
fn partial_matches() {
let all = vec![
Rc::new(MatchingWord::new("the".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("t".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("he".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("door".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("do".to_string(), 0, false).unwrap()),
Rc::new(MatchingWord::new("or".to_string(), 0, false).unwrap()),
];
let matching_words = vec![
(vec![all[0].clone()], vec![0]),
(vec![all[1].clone(), all[2].clone()], vec![0]),
(vec![all[3].clone()], vec![1]),
(vec![all[4].clone(), all[5].clone()], vec![1]),
(vec![all[4].clone()], vec![2]),
];
let matching_words = MatchingWords::new(matching_words).unwrap();
let mut builder = MatcherBuilder::from_matching_words(matching_words);
let temp_index = temp_index_with_documents();
let rtxn = temp_index.read_txn().unwrap();
let ctx = SearchContext::new(&temp_index, &rtxn);
let mut builder = MatcherBuilder::new_test(ctx, "the \"t he\" door \"do or\"");
builder.highlight_prefix("_".to_string());
builder.highlight_suffix("_".to_string());
@@ -859,7 +838,7 @@ mod tests {
let mut matcher = builder.build(text);
insta::assert_snapshot!(
matcher.format(format_options),
@"_the_ _do_ _or_ die can't be he _do_ and or isn'_t_ _he_"
@"_the_ _do_ _or_ die can't be he do and or isn'_t_ _he_"
);
}
}

View file

@@ -5,6 +5,7 @@ mod graph_based_ranking_rule;
mod interner;
mod limits;
mod logger;
pub mod matches;
mod query_graph;
mod query_term;
mod ranking_rule_graph;
@@ -33,8 +34,8 @@ use interner::DedupInterner;
pub use logger::detailed::DetailedSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
use query_graph::{QueryGraph, QueryNode};
use query_term::{located_query_terms_from_string, Phrase, QueryTerm};
use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
use resolve_query_graph::PhraseDocIdsCache;
use roaring::RoaringBitmap;
use words::Words;
@@ -47,10 +48,7 @@ use self::ranking_rules::{BoxRankingRule, RankingRule};
use self::resolve_query_graph::compute_query_graph_docids;
use self::sort::Sort;
use crate::search::new::distinct::apply_distinct_rule;
use crate::{
AscDesc, Filter, Index, MatchingWords, Member, Result, SearchResult, TermsMatchingStrategy,
UserError,
};
use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
/// A structure used throughout the execution of a search query.
pub struct SearchContext<'ctx> {
@@ -62,6 +60,7 @@ pub struct SearchContext<'ctx> {
pub term_interner: Interner<QueryTerm>,
pub phrase_docids: PhraseDocIdsCache,
}
impl<'ctx> SearchContext<'ctx> {
pub fn new(index: &'ctx Index, txn: &'ctx RoTxn<'ctx>) -> Self {
Self {
@@ -291,13 +290,14 @@ pub fn execute_search(
words_limit: Option<usize>,
placeholder_search_logger: &mut dyn SearchLogger<PlaceholderQuery>,
query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
) -> Result<SearchResult> {
) -> Result<PartialSearchResult> {
let mut universe = if let Some(filters) = filters {
filters.evaluate(ctx.txn, ctx.index)?
} else {
ctx.index.documents_ids(ctx.txn)?
};
let mut located_query_terms = None;
let bucket_sort_output = if let Some(query) = query {
// We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
@@ -317,6 +317,7 @@ pub fn execute_search(
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
let graph = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(query_terms);
check_sort_criteria(ctx, sort_criteria.as_ref())?;
@@ -357,9 +358,7 @@
}
}
Ok(SearchResult {
// TODO: correct matching words
matching_words: MatchingWords::default(),
Ok(PartialSearchResult {
candidates: all_candidates,
documents_ids: docids,
})
@@ -406,3 +405,9 @@ fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>
Ok(())
}
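/// Intermediate result of `execute_search`, returned before the matching words
/// are computed: the caller is expected to consume the `SearchContext` and
/// `located_query_terms` to build the final `SearchResult` (see `Search::execute`).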
pub struct PartialSearchResult {
pub located_query_terms: Option<Vec<LocatedQueryTerm>>,
pub candidates: RoaringBitmap,
pub documents_ids: Vec<DocumentId>,
}