Support ngram typos + splitwords and splitwords+synonyms in proximity

2025-07-04 20:37:15 +02:00 · 2023-03-13 17:21:29 +01:00 · 2023-03-13 17:21:29 +01:00 · 3004e281d7
commit 3004e281d7
parent 14e8d0aaa2
9 changed files with 701 additions and 411 deletions
--- a/milli/src/search/new/logger/detailed.rs
+++ b/milli/src/search/new/logger/detailed.rs
@ -447,6 +447,8 @@ results.{random} {{
                        use_prefix_db,
                        synonyms,
                        split_words,
+                        prefix_of,
+                        is_prefix: _,
                    } = ctx.derivations_interner.get(*derivations);

                    let original = ctx.word_interner.get(*original);
@ -460,6 +462,10 @@ shape: class"
                        let w = ctx.word_interner.get(w);
                        writeln!(file, "\"{w}\" : 0").unwrap();
                    }
+                    for w in prefix_of.iter().copied() {
+                        let w = ctx.word_interner.get(w);
+                        writeln!(file, "\"{w}\" : 0P").unwrap();
+                    }
                    for w in one_typo.iter().copied() {
                        let w = ctx.word_interner.get(w);
                        writeln!(file, "\"{w}\" : 1").unwrap();
@ -478,8 +484,9 @@ shape: class"
                        let phrase_str = phrase.description(&ctx.word_interner);
                        writeln!(file, "\"{phrase_str}\" : synonym").unwrap();
                    }
-                    if *use_prefix_db {
-                        writeln!(file, "use prefix DB : true").unwrap();
+                    if let Some(use_prefix_db) = use_prefix_db {
+                        let p = ctx.word_interner.get(*use_prefix_db);
+                        writeln!(file, "use prefix DB : {p}").unwrap();
                    }
                    for (d, edges) in distances.iter() {
                        writeln!(file, "\"distance {d}\" : {:?}", edges.iter().collect::<Vec<_>>())
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -295,45 +295,45 @@ mod tests {

        println!("nbr docids: {}", index.documents_ids(&txn).unwrap().len());

-        loop {
-            let start = Instant::now();
+        // loop {
+        let start = Instant::now();

-            // let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
-            let mut ctx = SearchContext::new(&index, &txn);
-            let results = execute_search(
-                &mut ctx,
-                "zero config",
-                TermsMatchingStrategy::Last,
-                None,
-                0,
-                20,
-                &mut DefaultSearchLogger,
-                &mut DefaultSearchLogger,
-            )
-            .unwrap();
+        let mut logger = crate::search::new::logger::detailed::DetailedSearchLogger::new("log");
+        let mut ctx = SearchContext::new(&index, &txn);
+        let results = execute_search(
+            &mut ctx,
+            "sun flower s are the best",
+            TermsMatchingStrategy::Last,
+            None,
+            0,
+            20,
+            &mut DefaultSearchLogger,
+            &mut logger,
+        )
+        .unwrap();

-            // logger.write_d2_description(&mut ctx);
+        logger.write_d2_description(&mut ctx);

-            let elapsed = start.elapsed();
-            println!("{}us", elapsed.as_micros());
+        let elapsed = start.elapsed();
+        println!("{}us", elapsed.as_micros());

-            let _documents = index
-                .documents(&txn, results.iter().copied())
-                .unwrap()
-                .into_iter()
-                .map(|(id, obkv)| {
-                    let mut object = serde_json::Map::default();
-                    for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
-                        let value = obkv.get(fid).unwrap();
-                        let value: serde_json::Value = serde_json::from_slice(value).unwrap();
-                        object.insert(fid_name.to_owned(), value);
-                    }
-                    (id, serde_json::to_string_pretty(&object).unwrap())
-                })
-                .collect::<Vec<_>>();
+        let _documents = index
+            .documents(&txn, results.iter().copied())
+            .unwrap()
+            .into_iter()
+            .map(|(id, obkv)| {
+                let mut object = serde_json::Map::default();
+                for (fid, fid_name) in index.fields_ids_map(&txn).unwrap().iter() {
+                    let value = obkv.get(fid).unwrap();
+                    let value: serde_json::Value = serde_json::from_slice(value).unwrap();
+                    object.insert(fid_name.to_owned(), value);
+                }
+                (id, serde_json::to_string_pretty(&object).unwrap())
+            })
+            .collect::<Vec<_>>();

-            println!("{}us: {:?}", elapsed.as_micros(), results);
-        }
+        println!("{}us: {:?}", elapsed.as_micros(), results);
+        // }
        // for (id, _document) in documents {
        //     println!("{id}:");
        //     // println!("{document}");
--- a/milli/src/search/new/query_graph.rs
+++ b/milli/src/search/new/query_graph.rs
@ -1,4 +1,4 @@
-use super::query_term::{self, LocatedQueryTerm, QueryTerm, WordDerivations};
+use super::query_term::{self, number_of_typos_allowed, LocatedQueryTerm};
 use super::small_bitmap::SmallBitmap;
 use super::SearchContext;
 use crate::Result;
@ -132,18 +132,18 @@ impl QueryGraph {
 impl QueryGraph {
    /// Build the query graph from the parsed user search query.
    pub fn from_query(ctx: &mut SearchContext, terms: Vec<LocatedQueryTerm>) -> Result<QueryGraph> {
+        let nbr_typos = number_of_typos_allowed(ctx)?;
+
        let mut empty_nodes = vec![];

-        let word_set = ctx.index.words_fst(ctx.txn)?;
        let mut graph = QueryGraph::default();

+        // TODO: we could consider generalizing to 4,5,6,7,etc. ngrams
        let (mut prev2, mut prev1, mut prev0): (Vec<u16>, Vec<u16>, Vec<u16>) =
            (vec![], vec![], vec![graph.root_node]);

-        for length in 1..=terms.len() {
-            let query = &terms[..length];
-
-            let term0 = query.last().unwrap();
+        for term_idx in 0..terms.len() {
+            let term0 = &terms[term_idx];

            let mut new_nodes = vec![];
            let new_node_idx = graph.add_node(&prev0, QueryNode::Term(term0.clone()));
@ -153,57 +153,19 @@ impl QueryGraph {
            }

            if !prev1.is_empty() {
-                if let Some((ngram2_str, ngram2_pos)) =
-                    query_term::ngram2(ctx, &query[length - 2], &query[length - 1])
+                if let Some(ngram) =
+                    query_term::make_ngram(ctx, &terms[term_idx - 1..=term_idx], &nbr_typos)?
                {
-                    if word_set.contains(ctx.word_interner.get(ngram2_str)) {
-                        let ngram2 = LocatedQueryTerm {
-                            value: QueryTerm::Word {
-                                derivations: ctx.derivations_interner.insert(WordDerivations {
-                                    original: ngram2_str,
-                                    // TODO: could add a typo if it's an ngram?
-                                    zero_typo: Box::new([ngram2_str]),
-                                    one_typo: Box::new([]),
-                                    two_typos: Box::new([]),
-                                    use_prefix_db: false,
-                                    synonyms: Box::new([]), // TODO: ngram synonyms
-                                    split_words: None,      // TODO: maybe ngram split words?
-                                }),
-                            },
-                            positions: ngram2_pos,
-                        };
-                        let ngram2_idx = graph.add_node(&prev1, QueryNode::Term(ngram2));
-                        new_nodes.push(ngram2_idx);
-                    }
+                    let ngram_idx = graph.add_node(&prev1, QueryNode::Term(ngram));
+                    new_nodes.push(ngram_idx);
                }
            }
            if !prev2.is_empty() {
-                if let Some((ngram3_str, ngram3_pos)) = query_term::ngram3(
-                    ctx,
-                    &query[length - 3],
-                    &query[length - 2],
-                    &query[length - 1],
-                ) {
-                    if word_set.contains(ctx.word_interner.get(ngram3_str)) {
-                        let ngram3 = LocatedQueryTerm {
-                            value: QueryTerm::Word {
-                                derivations: ctx.derivations_interner.insert(WordDerivations {
-                                    original: ngram3_str,
-                                    // TODO: could add a typo if it's an ngram?
-                                    zero_typo: Box::new([ngram3_str]),
-                                    one_typo: Box::new([]),
-                                    two_typos: Box::new([]),
-                                    use_prefix_db: false,
-                                    synonyms: Box::new([]), // TODO: ngram synonyms
-                                    split_words: None,      // TODO: maybe ngram split words?
-                                                            // would be nice for typos like su nflower
-                                }),
-                            },
-                            positions: ngram3_pos,
-                        };
-                        let ngram3_idx = graph.add_node(&prev2, QueryNode::Term(ngram3));
-                        new_nodes.push(ngram3_idx);
-                    }
+                if let Some(ngram) =
+                    query_term::make_ngram(ctx, &terms[term_idx - 2..=term_idx], &nbr_typos)?
+                {
+                    let ngram_idx = graph.add_node(&prev2, QueryNode::Term(ngram));
+                    new_nodes.push(ngram_idx);
                }
            }
            (prev0, prev1, prev2) = (new_nodes, prev0, prev1);
--- a/milli/src/search/new/query_term.rs
+++ b/milli/src/search/new/query_term.rs
@ -13,7 +13,7 @@ use super::interner::{Interned, Interner};
 use super::SearchContext;
 use crate::search::fst_utils::{Complement, Intersection, StartsWith, Union};
 use crate::search::{build_dfa, get_first};
-use crate::{CboRoaringBitmapLenCodec, Index, Result};
+use crate::{CboRoaringBitmapLenCodec, Index, Result, MAX_WORD_LENGTH};

 /// A phrase in the user's search query, consisting of several words
 /// that must appear side-by-side in the search results.
@ -31,46 +31,70 @@ impl Phrase {
 /// a term in the user's search query.
 #[derive(Clone, PartialEq, Eq, Hash)]
 pub struct WordDerivations {
-    /// The original word
+    /// The original terms, for debugging purposes
    pub original: Interned<String>,
-    // TODO: original should only be used for debugging purposes?
-    // TODO: pub zero_typo: Option<Interned<String>>,
-    // TODO: pub prefix_of: Box<[Interned<String>]>,
+    pub is_prefix: bool,
+
+    /// A single word equivalent to the original one, with zero typos
+    pub zero_typo: Option<Interned<String>>,
+    /// All the words that contain the original word as prefix
+    pub prefix_of: Box<[Interned<String>]>,
    /// All the synonyms of the original word
    pub synonyms: Box<[Interned<Phrase>]>,

    /// The original word split into multiple consecutive words
    pub split_words: Option<Interned<Phrase>>,

-    /// The original words and words which are prefixed by it
-    pub zero_typo: Box<[Interned<String>]>,
-
    /// Words that are 1 typo away from the original word
    pub one_typo: Box<[Interned<String>]>,

    /// Words that are 2 typos away from the original word
    pub two_typos: Box<[Interned<String>]>,

-    /// True if the prefix databases must be used to retrieve
-    /// the words which are prefixed by the original word.
-    pub use_prefix_db: bool,
+    /// A prefix in the prefix databases matching the original word
+    pub use_prefix_db: Option<Interned<String>>,
 }
 impl WordDerivations {
+    pub fn empty(word_interner: &mut Interner<String>, original: &str) -> Self {
+        Self {
+            original: word_interner.insert(original.to_owned()),
+            is_prefix: false,
+            zero_typo: None,
+            prefix_of: Box::new([]),
+            synonyms: Box::new([]),
+            split_words: None,
+            one_typo: Box::new([]),
+            two_typos: Box::new([]),
+            use_prefix_db: None,
+        }
+    }
    /// Return an iterator over all the single words derived from the original word.
    ///
    /// This excludes synonyms, split words, and words stored in the prefix databases.
    pub fn all_single_word_derivations_except_prefix_db(
        &'_ self,
    ) -> impl Iterator<Item = Interned<String>> + Clone + '_ {
-        self.zero_typo.iter().chain(self.one_typo.iter()).chain(self.two_typos.iter()).copied()
+        self.zero_typo
+            .iter()
+            .chain(self.prefix_of.iter())
+            .chain(self.one_typo.iter())
+            .chain(self.two_typos.iter())
+            .copied()
+    }
+    /// Return an iterator over all the single words derived from the original word.
+    ///
+    /// This excludes synonyms, split words, and words stored in the prefix databases.
+    pub fn all_phrase_derivations(&'_ self) -> impl Iterator<Item = Interned<Phrase>> + Clone + '_ {
+        self.split_words.iter().chain(self.synonyms.iter()).copied()
    }
    pub fn is_empty(&self) -> bool {
-        self.zero_typo.is_empty()
+        self.zero_typo.is_none()
            && self.one_typo.is_empty()
            && self.two_typos.is_empty()
+            && self.prefix_of.is_empty()
            && self.synonyms.is_empty()
            && self.split_words.is_none()
-            && !self.use_prefix_db
+            && self.use_prefix_db.is_none()
    }
 }

@ -80,7 +104,11 @@ pub fn word_derivations(
    word: &str,
    max_typo: u8,
    is_prefix: bool,
-) -> Result<Interned<WordDerivations>> {
+) -> Result<WordDerivations> {
+    if word.len() > MAX_WORD_LENGTH {
+        return Ok(WordDerivations::empty(&mut ctx.word_interner, word));
+    }
+
    let fst = ctx.index.words_fst(ctx.txn)?;
    let word_interned = ctx.word_interner.insert(word.to_owned());

@ -91,23 +119,29 @@ pub fn word_derivations(
            .remap_data_type::<DecodeIgnore>()
            .get(ctx.txn, word)?
            .is_some();
+    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };

-    let mut zero_typo = vec![];
+    let mut zero_typo = None;
+    let mut prefix_of = vec![];
    let mut one_typo = vec![];
    let mut two_typos = vec![];

+    if fst.contains(word) {
+        zero_typo = Some(word_interned);
+    }
+
    if max_typo == 0 {
-        if is_prefix && !use_prefix_db {
+        if is_prefix && use_prefix_db.is_none() {
            let prefix = Str::new(word).starts_with();
            let mut stream = fst.search(prefix).into_stream();

            while let Some(derived_word) = stream.next() {
                let derived_word = std::str::from_utf8(derived_word)?.to_owned();
                let derived_word_interned = ctx.word_interner.insert(derived_word);
-                zero_typo.push(derived_word_interned);
+                if derived_word_interned != word_interned {
+                    prefix_of.push(derived_word_interned);
+                }
            }
-        } else if fst.contains(word) {
-            zero_typo.push(word_interned);
        }
    } else if max_typo == 1 {
        let dfa = build_dfa(word, 1, is_prefix);
@ -122,7 +156,9 @@ pub fn word_derivations(
            let derived_word_interned = ctx.word_interner.insert(derived_word.to_owned());
            match d.to_u8() {
                0 => {
-                    zero_typo.push(derived_word_interned);
+                    if derived_word_interned != word_interned {
+                        prefix_of.push(derived_word_interned);
+                    }
                }
                1 => {
                    one_typo.push(derived_word_interned);
@ -153,7 +189,9 @@ pub fn word_derivations(
                let d = second_dfa.distance((state.1).0);
                match d.to_u8() {
                    0 => {
-                        zero_typo.push(derived_word_interned);
+                        if derived_word_interned != word_interned {
+                            prefix_of.push(derived_word_interned);
+                        }
                    }
                    1 => {
                        one_typo.push(derived_word_interned);
@ -185,17 +223,17 @@ pub fn word_derivations(
        })
        .collect();

-    let interned = ctx.derivations_interner.insert(WordDerivations {
-        original: ctx.word_interner.insert(word.to_owned()),
+    Ok(WordDerivations {
+        original: word_interned,
+        is_prefix,
+        zero_typo,
+        prefix_of: prefix_of.into_boxed_slice(),
        synonyms,
        split_words,
-        zero_typo: zero_typo.into_boxed_slice(),
        one_typo: one_typo.into_boxed_slice(),
        two_typos: two_typos.into_boxed_slice(),
        use_prefix_db,
-    });
-
-    Ok(interned)
+    })
 }

 /// Split the original word into the two words that appear the
@ -236,12 +274,17 @@ pub enum QueryTerm {
 }

 impl QueryTerm {
+    pub fn is_prefix(&self, derivations_interner: &Interner<WordDerivations>) -> bool {
+        match self {
+            QueryTerm::Phrase { .. } => false,
+            QueryTerm::Word { derivations } => derivations_interner.get(*derivations).is_prefix,
+        }
+    }
    /// Return the original word from the given query term
-    pub fn original_single_word<'interner>(
+    pub fn original_single_word(
        &self,
-        word_interner: &'interner Interner<String>,
-        derivations_interner: &'interner Interner<WordDerivations>,
-    ) -> Option<&'interner str> {
+        derivations_interner: &Interner<WordDerivations>,
+    ) -> Option<Interned<String>> {
        match self {
            QueryTerm::Phrase { phrase: _ } => None,
            QueryTerm::Word { derivations } => {
@ -249,7 +292,7 @@ impl QueryTerm {
                if derivations.is_empty() {
                    None
                } else {
-                    Some(word_interner.get(derivations.original))
+                    Some(derivations.original)
                }
            }
        }
@ -281,25 +324,7 @@ pub fn located_query_terms_from_string<'ctx>(
    query: NormalizedTokenIter<Vec<u8>>,
    words_limit: Option<usize>,
 ) -> Result<Vec<LocatedQueryTerm>> {
-    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
-    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
-    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
-
-    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
-    let exact_words = ctx.index.exact_words(ctx.txn)?;
-
-    let nbr_typos = |word: &str| {
-        if !authorize_typos
-            || word.len() < min_len_one_typo as usize
-            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
-        {
-            0
-        } else if word.len() < min_len_two_typos as usize {
-            1
-        } else {
-            2
-        }
-    };
+    let nbr_typos = number_of_typos_allowed(ctx)?;

    let mut located_terms = Vec::new();

@ -344,7 +369,9 @@ pub fn located_query_terms_from_string<'ctx>(
                            let word = token.lemma();
                            let derivations = word_derivations(ctx, word, nbr_typos(word), false)?;
                            let located_term = LocatedQueryTerm {
-                                value: QueryTerm::Word { derivations },
+                                value: QueryTerm::Word {
+                                    derivations: ctx.derivations_interner.insert(derivations),
+                                },
                                positions: position..=position,
                            };
                            located_terms.push(located_term);
@ -355,7 +382,9 @@ pub fn located_query_terms_from_string<'ctx>(
                    let word = token.lemma();
                    let derivations = word_derivations(ctx, word, nbr_typos(word), true)?;
                    let located_term = LocatedQueryTerm {
-                        value: QueryTerm::Word { derivations },
+                        value: QueryTerm::Word {
+                            derivations: ctx.derivations_interner.insert(derivations),
+                        },
                        positions: position..=position,
                    };
                    located_terms.push(located_term);
@ -407,54 +436,171 @@ pub fn located_query_terms_from_string<'ctx>(
    Ok(located_terms)
 }

-// TODO: return a word derivations instead?
-pub fn ngram2(
-    ctx: &mut SearchContext,
-    x: &LocatedQueryTerm,
-    y: &LocatedQueryTerm,
-) -> Option<(Interned<String>, RangeInclusive<i8>)> {
-    if *x.positions.end() != y.positions.start() - 1 {
-        return None;
-    }
-    match (
-        &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-        &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-    ) {
-        (Some(w1), Some(w2)) => {
-            let term = (
-                ctx.word_interner.insert(format!("{w1}{w2}")),
-                *x.positions.start()..=*y.positions.end(),
-            );
-            Some(term)
+pub fn number_of_typos_allowed<'ctx>(
+    ctx: &SearchContext<'ctx>,
+) -> Result<impl Fn(&str) -> u8 + 'ctx> {
+    let authorize_typos = ctx.index.authorize_typos(ctx.txn)?;
+    let min_len_one_typo = ctx.index.min_word_len_one_typo(ctx.txn)?;
+    let min_len_two_typos = ctx.index.min_word_len_two_typos(ctx.txn)?;
+
+    // TODO: should `exact_words` also disable prefix search, ngrams, split words, or synonyms?
+    let exact_words = ctx.index.exact_words(ctx.txn)?;
+
+    Ok(Box::new(move |word: &str| {
+        if !authorize_typos
+            || word.len() < min_len_one_typo as usize
+            || exact_words.as_ref().map_or(false, |fst| fst.contains(word))
+        {
+            0
+        } else if word.len() < min_len_two_typos as usize {
+            1
+        } else {
+            2
        }
-        _ => None,
-    }
+    }))
 }

-// TODO: return a word derivations instead?
-pub fn ngram3(
+pub fn make_ngram(
    ctx: &mut SearchContext,
-    x: &LocatedQueryTerm,
-    y: &LocatedQueryTerm,
-    z: &LocatedQueryTerm,
-) -> Option<(Interned<String>, RangeInclusive<i8>)> {
-    if *x.positions.end() != y.positions.start() - 1
-        || *y.positions.end() != z.positions.start() - 1
-    {
-        return None;
-    }
-    match (
-        &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-        &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-        &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
-    ) {
-        (Some(w1), Some(w2), Some(w3)) => {
-            let term = (
-                ctx.word_interner.insert(format!("{w1}{w2}{w3}")),
-                *x.positions.start()..=*z.positions.end(),
-            );
-            Some(term)
+    terms: &[LocatedQueryTerm],
+    number_of_typos_allowed: &impl Fn(&str) -> u8,
+) -> Result<Option<LocatedQueryTerm>> {
+    assert!(!terms.is_empty());
+    for ts in terms.windows(2) {
+        let [t1, t2] = ts else { panic!() };
+        if *t1.positions.end() != t2.positions.start() - 1 {
+            return Ok(None);
        }
-        _ => None,
    }
+    let mut words_interned = vec![];
+    for term in terms {
+        if let Some(original_term_word) = term.value.original_single_word(&ctx.derivations_interner)
+        {
+            words_interned.push(original_term_word);
+        } else {
+            return Ok(None);
+        }
+    }
+    let words =
+        words_interned.iter().map(|&i| ctx.word_interner.get(i).to_owned()).collect::<Vec<_>>();
+
+    let start = *terms.first().as_ref().unwrap().positions.start();
+    let end = *terms.last().as_ref().unwrap().positions.end();
+    let is_prefix = terms.last().as_ref().unwrap().value.is_prefix(&ctx.derivations_interner);
+    let ngram_str = words.join("");
+    if ngram_str.len() > MAX_WORD_LENGTH {
+        return Ok(None);
+    }
+
+    let mut derivations = word_derivations(
+        ctx,
+        &ngram_str,
+        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8),
+        is_prefix,
+    )?;
+    derivations.original = ctx.word_interner.insert(words.join(" "));
+    // Now add the synonyms
+    let index_synonyms = ctx.index.synonyms(ctx.txn)?;
+    let mut derivations_synonyms = derivations.synonyms.to_vec();
+    derivations_synonyms.extend(
+        index_synonyms.get(&words).cloned().unwrap_or_default().into_iter().map(|words| {
+            let words = words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+            ctx.phrase_interner.insert(Phrase { words })
+        }),
+    );
+    derivations.synonyms = derivations_synonyms.into_boxed_slice();
+    if let Some(split_words) = derivations.split_words {
+        let split_words = ctx.phrase_interner.get(split_words);
+        if split_words.words == words_interned.iter().map(|&i| Some(i)).collect::<Vec<_>>() {
+            derivations.split_words = None;
+        }
+    }
+    if derivations.is_empty() {
+        return Ok(None);
+    }
+    let term = LocatedQueryTerm {
+        value: QueryTerm::Word { derivations: ctx.derivations_interner.insert(derivations) },
+        positions: start..=end,
+    };
+
+    Ok(Some(term))
 }
+
+// // TODO: return a word derivations instead?
+// pub fn ngram2(
+//     ctx: &mut SearchContext,
+//     x: &LocatedQueryTerm,
+//     y: &LocatedQueryTerm,
+//     number_of_typos_allowed: impl Fn(&str) -> u8,
+// ) -> Result<Option<LocatedQueryTerm>> {
+//     if *x.positions.end() != y.positions.start() - 1 {
+//         return Ok(None);
+//     }
+//     match (
+//         x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
+//         y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
+//     ) {
+//         (Some(w1), Some(w2)) => {
+//             let ngram2_str = format!("{w1}{w2}");
+//             let mut derivations = word_derivations(
+//                 ctx,
+//                 &ngram2_str,
+//                 number_of_typos_allowed(ngram2_str.as_str()).saturating_sub(1),
+//                 y.value.is_prefix(&ctx.derivations_interner),
+//             )?;
+//             // Now add the synonyms
+//             let index_synonyms = ctx.index.synonyms(ctx.txn)?;
+//             let mut derivations_synonyms = derivations.synonyms.to_vec();
+//             derivations_synonyms.extend(
+//                 index_synonyms
+//                     .get(&vec![w1.to_owned(), w2.to_owned()])
+//                     .cloned()
+//                     .unwrap_or_default()
+//                     .into_iter()
+//                     .map(|words| {
+//                         let words =
+//                             words.into_iter().map(|w| Some(ctx.word_interner.insert(w))).collect();
+//                         ctx.phrase_interner.insert(Phrase { words })
+//                     }),
+//             );
+
+//             let term = LocatedQueryTerm {
+//                 value: QueryTerm::Word {
+//                     derivations: ctx.derivations_interner.insert(derivations),
+//                 },
+//                 positions: *x.positions.start()..=*y.positions.end(),
+//             };
+
+//             Ok(Some(term))
+//         }
+//         _ => Ok(None),
+//     }
+// }
+
+// // TODO: return a word derivations instead?
+// pub fn ngram3(
+//     ctx: &mut SearchContext,
+//     x: &LocatedQueryTerm,
+//     y: &LocatedQueryTerm,
+//     z: &LocatedQueryTerm,
+// ) -> Option<(Interned<String>, RangeInclusive<i8>)> {
+//     if *x.positions.end() != y.positions.start() - 1
+//         || *y.positions.end() != z.positions.start() - 1
+//     {
+//         return None;
+//     }
+//     match (
+//         &x.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
+//         &y.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
+//         &z.value.original_single_word(&ctx.word_interner, &ctx.derivations_interner),
+//     ) {
+//         (Some(w1), Some(w2), Some(w3)) => {
+//             let term = (
+//                 ctx.word_interner.insert(format!("{w1}{w2}{w3}")),
+//                 *x.positions.start()..=*z.positions.end(),
+//             );
+//             Some(term)
+//         }
+//         _ => None,
+//     }
+// }
--- a/milli/src/search/new/ranking_rule_graph/proximity/build.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/build.rs
@ -1,37 +1,43 @@
+#![allow(clippy::too_many_arguments)]
 use std::collections::BTreeMap;

-use itertools::Itertools;
-
 use super::ProximityEdge;
-use crate::search::new::interner::Interner;
-use crate::search::new::query_term::{LocatedQueryTerm, QueryTerm, WordDerivations};
+use crate::search::new::db_cache::DatabaseCache;
+use crate::search::new::interner::{Interned, Interner};
+use crate::search::new::query_term::{LocatedQueryTerm, Phrase, QueryTerm, WordDerivations};
 use crate::search::new::ranking_rule_graph::proximity::WordPair;
 use crate::search::new::ranking_rule_graph::EdgeCondition;
 use crate::search::new::{QueryNode, SearchContext};
 use crate::Result;
+use heed::RoTxn;

 pub fn visit_from_node(
    ctx: &mut SearchContext,
    from_node: &QueryNode,
-) -> Result<Option<(WordDerivations, i8)>> {
-    Ok(Some(match from_node {
+) -> Result<Option<(Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8)>> {
+    let SearchContext { derivations_interner, .. } = ctx;
+
+    let (left_phrase, left_derivations, left_end_position) = match from_node {
        QueryNode::Term(LocatedQueryTerm { value: value1, positions: pos1 }) => {
            match value1 {
                QueryTerm::Word { derivations } => {
-                    (ctx.derivations_interner.get(*derivations).clone(), *pos1.end())
+                    (None, derivations_interner.get(*derivations).clone(), *pos1.end())
                }
-                QueryTerm::Phrase { phrase: phrase1 } => {
-                    let phrase1 = ctx.phrase_interner.get(*phrase1);
-                    if let Some(original) = *phrase1.words.last().unwrap() {
+                QueryTerm::Phrase { phrase: phrase_interned } => {
+                    let phrase = ctx.phrase_interner.get(*phrase_interned);
+                    if let Some(original) = *phrase.words.last().unwrap() {
                        (
+                            Some(*phrase_interned),
                            WordDerivations {
                                original,
-                                zero_typo: Box::new([original]),
+                                zero_typo: Some(original),
                                one_typo: Box::new([]),
                                two_typos: Box::new([]),
-                                use_prefix_db: false,
+                                use_prefix_db: None,
                                synonyms: Box::new([]),
                                split_words: None,
+                                is_prefix: false,
+                                prefix_of: Box::new([]),
                            },
                            *pos1.end(),
                        )
@ -42,190 +48,175 @@ pub fn visit_from_node(
                }
            }
        }
-        QueryNode::Start => (
-            WordDerivations {
-                original: ctx.word_interner.insert(String::new()),
-                zero_typo: Box::new([]),
-                one_typo: Box::new([]),
-                two_typos: Box::new([]),
-                use_prefix_db: false,
-                synonyms: Box::new([]),
-                split_words: None,
-            },
-            -100,
-        ),
+        QueryNode::Start => (None, WordDerivations::empty(&mut ctx.word_interner, ""), -1),
        _ => return Ok(None),
-    }))
+    };
+
+    // left term cannot be a prefix
+    assert!(left_derivations.use_prefix_db.is_none() && !left_derivations.is_prefix);
+
+    let last_word_left_phrase = if let Some(left_phrase_interned) = left_phrase {
+        let left_phrase = ctx.phrase_interner.get(left_phrase_interned);
+        left_phrase.words.last().copied().unwrap()
+    } else {
+        None
+    };
+    let left_single_word_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = left_derivations
+        .all_single_word_derivations_except_prefix_db()
+        .chain(last_word_left_phrase.iter().copied())
+        .map(|w| (left_phrase, w))
+        .collect();
+    let left_phrase_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = left_derivations
+        .all_phrase_derivations()
+        .map(|left_phrase_interned: Interned<Phrase>| {
+            let left_phrase = ctx.phrase_interner.get(left_phrase_interned);
+            let last_word_left_phrase: Interned<String> =
+                left_phrase.words.last().unwrap().unwrap();
+            let r: (Option<Interned<Phrase>>, Interned<String>) =
+                (Some(left_phrase_interned), last_word_left_phrase);
+            r
+        })
+        .collect();
+    let mut left_word_iter = left_single_word_iter;
+    left_word_iter.extend(left_phrase_iter);
+
+    Ok(Some((left_word_iter, left_end_position)))
 }

-pub fn visit_to_node<'ctx, 'from_data>(
+pub fn build_step_visit_destination_node<'ctx, 'from_data>(
    ctx: &mut SearchContext<'ctx>,
    conditions_interner: &mut Interner<ProximityEdge>,
+    from_node_data: &'from_data (Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8),
    to_node: &QueryNode,
-    from_node_data: &'from_data (WordDerivations, i8),
 ) -> Result<Vec<(u8, EdgeCondition<ProximityEdge>)>> {
-    let SearchContext { index, txn, db_cache, word_interner, derivations_interner, .. } = ctx;
-
-    // IMPORTANT! TODO: split words support
-
-    let (derivations1, pos1) = from_node_data;
-    let term2 = match &to_node {
+    let SearchContext {
+        index,
+        txn,
+        db_cache,
+        word_interner,
+        phrase_interner,
+        derivations_interner,
+        query_term_docids: _,
+    } = ctx;
+    let right_term = match &to_node {
        QueryNode::End => return Ok(vec![(0, EdgeCondition::Unconditional)]),
        QueryNode::Deleted | QueryNode::Start => return Ok(vec![]),
        QueryNode::Term(term) => term,
    };
-    let LocatedQueryTerm { value: value2, positions: pos2 } = term2;
+    let LocatedQueryTerm { value: right_value, positions: right_positions } = right_term;

-    let (derivations2, pos2, ngram_len2) = match value2 {
-        QueryTerm::Word { derivations } => {
-            (derivations_interner.get(*derivations).clone(), *pos2.start(), pos2.len())
-        }
-        QueryTerm::Phrase { phrase: phrase2 } => {
-            let phrase2 = ctx.phrase_interner.get(*phrase2);
-            if let Some(original) = *phrase2.words.first().unwrap() {
-                (
-                    WordDerivations {
-                        original,
-                        zero_typo: Box::new([original]),
-                        one_typo: Box::new([]),
-                        two_typos: Box::new([]),
-                        use_prefix_db: false,
-                        synonyms: Box::new([]),
-                        split_words: None,
-                    },
-                    *pos2.start(),
-                    1,
-                )
-            } else {
-                // No word pairs if the phrase does not have a regular word as its first term
-                return Ok(vec![]);
+    let (right_phrase, right_derivations, right_start_position, right_ngram_length) =
+        match right_value {
+            QueryTerm::Word { derivations } => (
+                None,
+                derivations_interner.get(*derivations).clone(),
+                *right_positions.start(),
+                right_positions.len(),
+            ),
+            QueryTerm::Phrase { phrase: right_phrase_interned } => {
+                let right_phrase = phrase_interner.get(*right_phrase_interned);
+                if let Some(original) = *right_phrase.words.first().unwrap() {
+                    (
+                        Some(*right_phrase_interned),
+                        WordDerivations {
+                            original,
+                            zero_typo: Some(original),
+                            one_typo: Box::new([]),
+                            two_typos: Box::new([]),
+                            use_prefix_db: None,
+                            synonyms: Box::new([]),
+                            split_words: None,
+                            is_prefix: false,
+                            prefix_of: Box::new([]),
+                        },
+                        *right_positions.start(),
+                        1,
+                    )
+                } else {
+                    // No word pairs if the phrase does not have a regular word as its first term
+                    return Ok(vec![]);
+                }
            }
-        }
-    };
+        };

-    if pos1 + 1 != pos2 {
-        // TODO: how should this actually be handled?
-        // We want to effectively ignore this pair of terms
+    let (left_derivations, left_end_position) = from_node_data;
+
+    if left_end_position + 1 != right_start_position {
+        // We want to ignore this pair of terms
        // Unconditionally walk through the edge without computing the docids
-        // But also what should the cost be?
+        // This can happen when, in a query like `the sun flowers are beautiful`, the term
+        // `flowers` is removed by the words ranking rule due to the terms matching strategy.
+        // The remaining query graph represents `the sun .. are beautiful`
+        // but `sun` and `are` have no proximity condition between them
        return Ok(vec![(0, EdgeCondition::Unconditional)]);
    }

-    let updb1 = derivations1.use_prefix_db;
-    let updb2 = derivations2.use_prefix_db;
-
-    // left term cannot be a prefix
-    assert!(!updb1);
-
-    // TODO: IMPORTANT! split words and synonyms support
-    let derivations1 = derivations1.all_single_word_derivations_except_prefix_db();
-    // TODO: eventually, we want to get rid of the uses from `orginal`
    let mut cost_proximity_word_pairs = BTreeMap::<u8, BTreeMap<u8, Vec<WordPair>>>::new();

-    if updb2 {
-        for word1 in derivations1.clone() {
-            for proximity in 1..=(8 - ngram_len2) {
-                let cost = (proximity + ngram_len2 - 1) as u8;
-                // TODO: if we had access to the universe here, we could already check whether
-                // the bitmap corresponding to this word pair is disjoint with the universe or not
-                if db_cache
-                    .get_word_prefix_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        word1,
-                        derivations2.original,
-                        proximity as u8,
-                    )?
-                    .is_some()
-                {
-                    cost_proximity_word_pairs
-                        .entry(cost)
-                        .or_default()
-                        .entry(proximity as u8)
-                        .or_default()
-                        .push(WordPair::WordPrefix {
-                            left: word1,
-                            right_prefix: derivations2.original,
-                        });
-                }
-                if db_cache
-                    .get_prefix_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        derivations2.original,
-                        word1,
-                        proximity as u8 - 1,
-                    )?
-                    .is_some()
-                {
-                    cost_proximity_word_pairs
-                        .entry(cost)
-                        .or_default()
-                        .entry(proximity as u8)
-                        .or_default()
-                        .push(WordPair::WordPrefixSwapped {
-                            left_prefix: derivations2.original,
-                            right: word1,
-                        });
-                }
-            }
+    if let Some(right_prefix) = right_derivations.use_prefix_db {
+        for (left_phrase, left_word) in left_derivations.iter().copied() {
+            add_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                right_ngram_length,
+                left_word,
+                right_prefix,
+                &mut cost_proximity_word_pairs,
+                left_phrase,
+            )?;
        }
    }

-    // TODO: important! support split words and synonyms as well
-    let derivations2 = derivations2.all_single_word_derivations_except_prefix_db();
    // TODO: add safeguard in case the cartesian product is too large!
    // even if we restrict the word derivations to a maximum of 100, the size of the
    // caterisan product could reach a maximum of 10_000 derivations, which is way too much.
    // mMaybe prioritise the product of zero typo derivations, then the product of zero-typo/one-typo
    // + one-typo/zero-typo, then one-typo/one-typo, then ... until an arbitrary limit has been
    // reached
-    let product_derivations = derivations1.cartesian_product(derivations2);
+    let first_word_right_phrase = if let Some(right_phrase_interned) = right_phrase {
+        let right_phrase = phrase_interner.get(right_phrase_interned);
+        right_phrase.words.first().copied().unwrap()
+    } else {
+        None
+    };
+    let right_single_word_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> =
+        right_derivations
+            .all_single_word_derivations_except_prefix_db()
+            .chain(first_word_right_phrase.iter().copied())
+            .map(|w| (right_phrase, w))
+            .collect();
+    let right_phrase_iter: Vec<(Option<Interned<Phrase>>, Interned<String>)> = right_derivations
+        .all_phrase_derivations()
+        .map(|right_phrase_interned: Interned<Phrase>| {
+            let right_phrase = phrase_interner.get(right_phrase_interned);
+            let first_word_right_phrase: Interned<String> =
+                right_phrase.words.first().unwrap().unwrap();
+            let r: (Option<Interned<Phrase>>, Interned<String>) =
+                (Some(right_phrase_interned), first_word_right_phrase);
+            r
+        })
+        .collect();
+    let mut right_word_iter = right_single_word_iter;
+    right_word_iter.extend(right_phrase_iter);

-    for (word1, word2) in product_derivations {
-        for proximity in 1..=(8 - ngram_len2) {
-            let cost = (proximity + ngram_len2 - 1) as u8;
-            if db_cache
-                .get_word_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    word1,
-                    word2,
-                    proximity as u8,
-                )?
-                .is_some()
-            {
-                cost_proximity_word_pairs
-                    .entry(cost)
-                    .or_default()
-                    .entry(proximity as u8)
-                    .or_default()
-                    .push(WordPair::Words { left: word1, right: word2 });
-            }
-            if proximity > 1
-                && db_cache
-                    .get_word_pair_proximity_docids(
-                        index,
-                        txn,
-                        word_interner,
-                        word2,
-                        word1,
-                        proximity as u8 - 1,
-                    )?
-                    .is_some()
-            {
-                cost_proximity_word_pairs
-                    .entry(cost)
-                    .or_default()
-                    .entry(proximity as u8 - 1)
-                    .or_default()
-                    .push(WordPair::Words { left: word2, right: word1 });
-            }
+    for (left_phrase, left_word) in left_derivations.iter().copied() {
+        for (right_phrase, right_word) in right_word_iter.iter().copied() {
+            add_non_prefix_edges(
+                index,
+                txn,
+                db_cache,
+                word_interner,
+                right_ngram_length,
+                left_word,
+                right_word,
+                &mut cost_proximity_word_pairs,
+                &[left_phrase, right_phrase].iter().copied().flatten().collect::<Vec<_>>(),
+            )?;
        }
    }
+
    let mut new_edges =
        cost_proximity_word_pairs
            .into_iter()
@ -243,6 +234,124 @@ pub fn visit_to_node<'ctx, 'from_data>(
                edges
            })
            .collect::<Vec<_>>();
-    new_edges.push((8 + (ngram_len2 - 1) as u8, EdgeCondition::Unconditional));
+    new_edges.push((8 + (right_ngram_length - 1) as u8, EdgeCondition::Unconditional));
    Ok(new_edges)
 }
+
+fn add_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut Interner<String>,
+    right_ngram_length: usize,
+    left_word: Interned<String>,
+    right_prefix: Interned<String>,
+    cost_proximity_word_pairs: &mut BTreeMap<u8, BTreeMap<u8, Vec<WordPair>>>,
+    left_phrase: Option<Interned<Phrase>>,
+) -> Result<()> {
+    for proximity in 1..=(8 - right_ngram_length) {
+        let cost = (proximity + right_ngram_length - 1) as u8;
+        // TODO: if we had access to the universe here, we could already check whether
+        // the bitmap corresponding to this word pair is disjoint with the universe or not
+        if db_cache
+            .get_word_prefix_pair_proximity_docids(
+                index,
+                txn,
+                word_interner,
+                left_word,
+                right_prefix,
+                proximity as u8,
+            )?
+            .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::WordPrefix {
+                    phrases: left_phrase.into_iter().collect(),
+                    left: left_word,
+                    right_prefix,
+                });
+        }
+
+        // No swapping when computing the proximity between a phrase and a word
+        if left_phrase.is_none()
+            && db_cache
+                .get_prefix_word_pair_proximity_docids(
+                    index,
+                    txn,
+                    word_interner,
+                    right_prefix,
+                    left_word,
+                    proximity as u8 - 1,
+                )?
+                .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::WordPrefixSwapped { left_prefix: right_prefix, right: left_word });
+        }
+    }
+    Ok(())
+}
+
+fn add_non_prefix_edges<'ctx>(
+    index: &mut &crate::Index,
+    txn: &'ctx RoTxn,
+    db_cache: &mut DatabaseCache<'ctx>,
+    word_interner: &mut Interner<String>,
+    right_ngram_length: usize,
+    word1: Interned<String>,
+    word2: Interned<String>,
+    cost_proximity_word_pairs: &mut BTreeMap<u8, BTreeMap<u8, Vec<WordPair>>>,
+    phrases: &[Interned<Phrase>],
+) -> Result<()> {
+    for proximity in 1..=(8 - right_ngram_length) {
+        let cost = (proximity + right_ngram_length - 1) as u8;
+        if db_cache
+            .get_word_pair_proximity_docids(
+                index,
+                txn,
+                word_interner,
+                word1,
+                word2,
+                proximity as u8,
+            )?
+            .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8)
+                .or_default()
+                .push(WordPair::Words { phrases: phrases.to_vec(), left: word1, right: word2 });
+        }
+        if proximity > 1
+            // no swapping when either term is a phrase
+            && phrases.is_empty()
+            && db_cache
+                .get_word_pair_proximity_docids(
+                    index,
+                    txn,
+                    word_interner,
+                    word2,
+                    word1,
+                    proximity as u8 - 1,
+                )?
+                .is_some()
+        {
+            cost_proximity_word_pairs
+                .entry(cost)
+                .or_default()
+                .entry(proximity as u8 - 1)
+                .or_default()
+                .push(WordPair::Words { phrases: vec![], left: word2, right: word1 });
+        }
+    }
+    Ok(())
+}
--- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs
@ -13,24 +13,61 @@ pub fn compute_docids<'ctx>(
    let ProximityEdge { pairs, proximity } = edge;
    let mut pair_docids = RoaringBitmap::new();
    for pair in pairs.iter() {
-        let bytes = match pair {
-            WordPair::Words { left, right } => db_cache.get_word_pair_proximity_docids(
-                index,
-                txn,
-                word_interner,
-                *left,
-                *right,
-                *proximity,
-            ),
-            WordPair::WordPrefix { left, right_prefix } => db_cache
-                .get_word_prefix_pair_proximity_docids(
-                    index,
-                    txn,
-                    word_interner,
-                    *left,
-                    *right_prefix,
-                    *proximity,
-                ),
+        let pair = match pair {
+            WordPair::Words { phrases, left, right } => {
+                let mut docids = db_cache
+                    .get_word_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
+                        *left,
+                        *right,
+                        *proximity,
+                    )?
+                    .map(CboRoaringBitmapCodec::deserialize_from)
+                    .transpose()?
+                    .unwrap_or_default();
+                if !docids.is_empty() {
+                    for phrase in phrases {
+                        docids &= ctx.query_term_docids.get_phrase_docids(
+                            index,
+                            txn,
+                            db_cache,
+                            word_interner,
+                            &ctx.phrase_interner,
+                            *phrase,
+                        )?;
+                    }
+                }
+                docids
+            }
+            WordPair::WordPrefix { phrases, left, right_prefix } => {
+                let mut docids = db_cache
+                    .get_word_prefix_pair_proximity_docids(
+                        index,
+                        txn,
+                        word_interner,
+                        *left,
+                        *right_prefix,
+                        *proximity,
+                    )?
+                    .map(CboRoaringBitmapCodec::deserialize_from)
+                    .transpose()?
+                    .unwrap_or_default();
+                if !docids.is_empty() {
+                    for phrase in phrases {
+                        docids &= ctx.query_term_docids.get_phrase_docids(
+                            index,
+                            txn,
+                            db_cache,
+                            word_interner,
+                            &ctx.phrase_interner,
+                            *phrase,
+                        )?;
+                    }
+                }
+                docids
+            }
            WordPair::WordPrefixSwapped { left_prefix, right } => db_cache
                .get_prefix_word_pair_proximity_docids(
                    index,
@ -39,11 +76,13 @@ pub fn compute_docids<'ctx>(
                    *left_prefix,
                    *right,
                    *proximity,
-                ),
-        }?;
-        // TODO: deserialize bitmap within a universe, and (maybe) using a bump allocator?
-        let bitmap = universe
-            & bytes.map(CboRoaringBitmapCodec::deserialize_from).transpose()?.unwrap_or_default();
+                )?
+                .map(CboRoaringBitmapCodec::deserialize_from)
+                .transpose()?
+                .unwrap_or_default(),
+        };
+        // TODO: deserialize bitmap within a universe
+        let bitmap = universe & pair;
        pair_docids |= bitmap;
    }

--- a/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/proximity/mod.rs
@ -7,16 +7,27 @@ use super::empty_paths_cache::EmptyPathsCache;
 use super::{EdgeCondition, RankingRuleGraphTrait};
 use crate::search::new::interner::{Interned, Interner};
 use crate::search::new::logger::SearchLogger;
-use crate::search::new::query_term::WordDerivations;
+use crate::search::new::query_term::Phrase;
 use crate::search::new::small_bitmap::SmallBitmap;
 use crate::search::new::{QueryGraph, QueryNode, SearchContext};
 use crate::Result;

 #[derive(Clone, PartialEq, Eq, Hash)]
 pub enum WordPair {
-    Words { left: Interned<String>, right: Interned<String> },
-    WordPrefix { left: Interned<String>, right_prefix: Interned<String> },
-    WordPrefixSwapped { left_prefix: Interned<String>, right: Interned<String> },
+    Words {
+        phrases: Vec<Interned<Phrase>>,
+        left: Interned<String>,
+        right: Interned<String>,
+    },
+    WordPrefix {
+        phrases: Vec<Interned<Phrase>>,
+        left: Interned<String>,
+        right_prefix: Interned<String>,
+    },
+    WordPrefixSwapped {
+        left_prefix: Interned<String>,
+        right: Interned<String>,
+    },
 }

 #[derive(Clone, PartialEq, Eq, Hash)]
@ -29,7 +40,7 @@ pub enum ProximityGraph {}

 impl RankingRuleGraphTrait for ProximityGraph {
    type EdgeCondition = ProximityEdge;
-    type BuildVisitedFromNode = (WordDerivations, i8);
+    type BuildVisitedFromNode = (Vec<(Option<Interned<Phrase>>, Interned<String>)>, i8);

    fn label_for_edge_condition(edge: &Self::EdgeCondition) -> String {
        let ProximityEdge { pairs, proximity } = edge;
@ -54,10 +65,15 @@ impl RankingRuleGraphTrait for ProximityGraph {
    fn build_step_visit_destination_node<'from_data, 'ctx: 'from_data>(
        ctx: &mut SearchContext<'ctx>,
        conditions_interner: &mut Interner<Self::EdgeCondition>,
-        to_node: &QueryNode,
-        from_node_data: &'from_data Self::BuildVisitedFromNode,
+        dest_node: &QueryNode,
+        source_node_data: &'from_data Self::BuildVisitedFromNode,
    ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
-        build::visit_to_node(ctx, conditions_interner, to_node, from_node_data)
+        build::build_step_visit_destination_node(
+            ctx,
+            conditions_interner,
+            source_node_data,
+            dest_node,
+        )
    }

    fn log_state(
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@ -84,7 +84,7 @@ impl RankingRuleGraphTrait for TypoGraph {
    ) -> Result<Vec<(u8, EdgeCondition<Self::EdgeCondition>)>> {
        let SearchContext { derivations_interner, .. } = ctx;
        match to_node {
-            QueryNode::Term(LocatedQueryTerm { value, .. }) => match *value {
+            QueryNode::Term(LocatedQueryTerm { value, positions }) => match *value {
                QueryTerm::Phrase { phrase } => Ok(vec![(
                    0,
                    EdgeCondition::Conditional(
@ -93,57 +93,62 @@ impl RankingRuleGraphTrait for TypoGraph {
                )]),
                QueryTerm::Word { derivations } => {
                    let mut edges = vec![];
+                    // Ngrams have a base typo cost
+                    // 2-gram -> equivalent to 1 typo
+                    // 3-gram -> equivalent to 2 typos
+                    let base_cost = positions.len().max(2) as u8;

                    for nbr_typos in 0..=2 {
                        let derivations = derivations_interner.get(derivations).clone();
                        let new_derivations = match nbr_typos {
-                            0 => {
-                                // TODO: think about how split words and synonyms should be handled here
-                                // TODO: what about ngrams?
-                                // Maybe 2grams should have one typo by default and 3grams 2 typos by default
-                                WordDerivations {
-                                    original: derivations.original,
-                                    synonyms: derivations.synonyms,
-                                    split_words: None,
-                                    zero_typo: derivations.zero_typo,
-                                    one_typo: Box::new([]),
-                                    two_typos: Box::new([]),
-                                    use_prefix_db: derivations.use_prefix_db,
-                                }
-                            }
+                            0 => WordDerivations {
+                                original: derivations.original,
+                                is_prefix: derivations.is_prefix,
+                                zero_typo: derivations.zero_typo,
+                                prefix_of: derivations.prefix_of,
+                                synonyms: derivations.synonyms,
+                                split_words: None,
+                                one_typo: Box::new([]),
+                                two_typos: Box::new([]),
+                                use_prefix_db: derivations.use_prefix_db,
+                            },
                            1 => {
                                // What about split words and synonyms here?
                                WordDerivations {
                                    original: derivations.original,
+                                    is_prefix: false,
+                                    zero_typo: None,
+                                    prefix_of: Box::new([]),
                                    synonyms: Box::new([]),
                                    split_words: derivations.split_words,
-                                    zero_typo: Box::new([]),
                                    one_typo: derivations.one_typo,
                                    two_typos: Box::new([]),
-                                    use_prefix_db: false, // false because all items from use_prefix_db haev 0 typos
+                                    use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
                                }
                            }
                            2 => {
                                // What about split words and synonyms here?
                                WordDerivations {
                                    original: derivations.original,
+                                    zero_typo: None,
+                                    is_prefix: false,
+                                    prefix_of: Box::new([]),
                                    synonyms: Box::new([]),
                                    split_words: None,
-                                    zero_typo: Box::new([]),
                                    one_typo: Box::new([]),
                                    two_typos: derivations.two_typos,
-                                    use_prefix_db: false, // false because all items from use_prefix_db haev 0 typos
+                                    use_prefix_db: None, // false because all items from use_prefix_db have 0 typos
                                }
                            }
                            _ => panic!(),
                        };
                        if !new_derivations.is_empty() {
                            edges.push((
-                                nbr_typos,
+                                nbr_typos as u8 + base_cost,
                                EdgeCondition::Conditional(conditions_interner.insert(
                                    TypoEdge::Word {
                                        derivations: derivations_interner.insert(new_derivations),
-                                        nbr_typos,
+                                        nbr_typos: nbr_typos as u8,
                                    },
                                )),
                            ))
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@ -54,25 +54,31 @@ impl QueryTermDocIdsCache {
            return Ok(&self.derivations[&derivations]);
        };
        let WordDerivations {
-            original,
+            original: _,
+            is_prefix: _,
+            zero_typo,
+            prefix_of,
            synonyms,
            split_words,
-            zero_typo,
            one_typo,
            two_typos,
            use_prefix_db,
        } = derivations_interner.get(derivations);
        let mut or_docids = vec![];
-        for word in zero_typo.iter().chain(one_typo.iter()).chain(two_typos.iter()).copied() {
+        for word in zero_typo
+            .iter()
+            .chain(prefix_of.iter())
+            .chain(one_typo.iter())
+            .chain(two_typos.iter())
+            .copied()
+        {
            if let Some(word_docids) = db_cache.get_word_docids(index, txn, word_interner, word)? {
                or_docids.push(word_docids);
            }
        }
-        if *use_prefix_db {
-            // TODO: this will change if we decide to change from (original, zero_typo) to:
-            // (debug_original, prefix_of, zero_typo)
+        if let Some(prefix) = use_prefix_db {
            if let Some(prefix_docids) =
-                db_cache.get_word_prefix_docids(index, txn, word_interner, *original)?
+                db_cache.get_word_prefix_docids(index, txn, word_interner, *prefix)?
            {
                or_docids.push(prefix_docids);
            }