Don't compute split_words for phrases

2025-05-25 09:03:59 +02:00 · 2023-05-16 16:22:23 +02:00 · 2023-05-16 16:22:23 +02:00 · 5758268866
commit 5758268866
parent a37da36766
5 changed files with 45 additions and 17 deletions
--- a/milli/src/search/new/logger/visual.rs
+++ b/milli/src/search/new/logger/visual.rs
@ -462,7 +462,7 @@ fill: \"#B6E2D3\"
                shape: class
                max_nbr_typo: {}",
                    term_subset.description(ctx),
-                    term_subset.max_nbr_typos(ctx)
+                    term_subset.max_typo_cost(ctx)
                )?;

                for w in term_subset.all_single_words_except_prefix_db(ctx)? {
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@ -28,14 +28,14 @@ pub enum ZeroOrOneTypo {
 impl Interned<QueryTerm> {
    pub fn compute_fully_if_needed(self, ctx: &mut SearchContext) -> Result<()> {
        let s = ctx.term_interner.get_mut(self);
-        if s.max_nbr_typos <= 1 && s.one_typo.is_uninit() {
+        if s.max_levenshtein_distance <= 1 && s.one_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            // Initialize one_typo subterm even if max_nbr_typo is 0 because of split words
            self.initialize_one_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
            assert!(s.one_typo.is_init());
            s.two_typo = Lazy::Init(TwoTypoTerm::default());
-        } else if s.max_nbr_typos > 1 && s.two_typo.is_uninit() {
+        } else if s.max_levenshtein_distance > 1 && s.two_typo.is_uninit() {
            assert!(s.two_typo.is_uninit());
            self.initialize_one_and_two_typo_subterm(ctx)?;
            let s = ctx.term_interner.get_mut(self);
@ -185,7 +185,7 @@ pub fn partially_initialized_term_from_word(
                original: ctx.word_interner.insert(word.to_owned()),
                ngram_words: None,
                is_prefix: false,
-                max_nbr_typos: 0,
+                max_levenshtein_distance: 0,
                zero_typo: <_>::default(),
                one_typo: Lazy::Init(<_>::default()),
                two_typo: Lazy::Init(<_>::default()),
@ -256,7 +256,7 @@ pub fn partially_initialized_term_from_word(
    Ok(QueryTerm {
        original: word_interned,
        ngram_words: None,
-        max_nbr_typos: max_typo,
+        max_levenshtein_distance: max_typo,
        is_prefix,
        zero_typo,
        one_typo: Lazy::Uninit,
@ -275,7 +275,16 @@ fn find_split_words(ctx: &mut SearchContext, word: &str) -> Result<Option<Intern
 impl Interned<QueryTerm> {
    fn initialize_one_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, one_typo, max_nbr_typos, .. } = self_mut;
+
+        let allows_split_words = self_mut.allows_split_words();
+        let QueryTerm {
+            original,
+            is_prefix,
+            one_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
+
        let original = *original;
        let is_prefix = *is_prefix;
        // let original_str = ctx.word_interner.get(*original).to_owned();
@ -300,13 +309,17 @@ impl Interned<QueryTerm> {
            })?;
        }

+        let split_words = if allows_split_words {
            let original_str = ctx.word_interner.get(original).to_owned();
-        let split_words = find_split_words(ctx, original_str.as_str())?;
+            find_split_words(ctx, original_str.as_str())?
+        } else {
+            None
+        };

        let self_mut = ctx.term_interner.get_mut(self);

        // Only add the split words to the derivations if:
-        // 1. the term is not an ngram; OR
+        // 1. the term is neither an ngram nor a phrase; OR
        // 2. the term is an ngram, but the split words are different from the ngram's component words
        let split_words = if let Some((ngram_words, split_words)) =
            self_mut.ngram_words.as_ref().zip(split_words.as_ref())
@ -328,7 +341,13 @@ impl Interned<QueryTerm> {
    }
    fn initialize_one_and_two_typo_subterm(self, ctx: &mut SearchContext) -> Result<()> {
        let self_mut = ctx.term_interner.get_mut(self);
-        let QueryTerm { original, is_prefix, two_typo, max_nbr_typos, .. } = self_mut;
+        let QueryTerm {
+            original,
+            is_prefix,
+            two_typo,
+            max_levenshtein_distance: max_nbr_typos,
+            ..
+        } = self_mut;
        let original_str = ctx.word_interner.get(*original).to_owned();
        if two_typo.is_init() {
            return Ok(());
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@ -43,7 +43,7 @@ pub struct QueryTermSubset {
 pub struct QueryTerm {
    original: Interned<String>,
    ngram_words: Option<Vec<Interned<String>>>,
-    max_nbr_typos: u8,
+    max_levenshtein_distance: u8,
    is_prefix: bool,
    zero_typo: ZeroTypoTerm,
    // May not be computed yet
@ -342,10 +342,16 @@ impl QueryTermSubset {
        }
        None
    }
-    pub fn max_nbr_typos(&self, ctx: &SearchContext) -> u8 {
+    pub fn max_typo_cost(&self, ctx: &SearchContext) -> u8 {
        let t = ctx.term_interner.get(self.original);
-        match t.max_nbr_typos {
-            0 => 0,
+        match t.max_levenshtein_distance {
+            0 => {
+                if t.allows_split_words() {
+                    1
+                } else {
+                    0
+                }
+            }
            1 => {
                if self.one_typo_subset.is_empty() {
                    0
@ -438,6 +444,9 @@ impl QueryTerm {

        self.zero_typo.is_empty() && one_typo.is_empty() && two_typo.is_empty()
    }
+    fn allows_split_words(&self) -> bool {
+        self.zero_typo.phrase.is_none()
+    }
 }

 impl Interned<QueryTerm> {
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@ -217,7 +217,7 @@ pub fn make_ngram(
        original: ngram_str_interned,
        ngram_words: Some(words_interned),
        is_prefix,
-        max_nbr_typos,
+        max_levenshtein_distance: max_nbr_typos,
        zero_typo: term.zero_typo,
        one_typo: Lazy::Uninit,
        two_typo: Lazy::Uninit,
@ -271,7 +271,7 @@ impl PhraseBuilder {
                QueryTerm {
                    original: ctx.word_interner.insert(phrase_desc),
                    ngram_words: None,
-                    max_nbr_typos: 0,
+                    max_levenshtein_distance: 0,
                    is_prefix: false,
                    zero_typo: ZeroTypoTerm {
                        phrase: Some(phrase),
--- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs
+++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs
@ -50,7 +50,7 @@ impl RankingRuleGraphTrait for TypoGraph {
        // 3-gram -> equivalent to 2 typos
        let base_cost = if term.term_ids.len() == 1 { 0 } else { term.term_ids.len() as u32 };

-        for nbr_typos in 0..=term.term_subset.max_nbr_typos(ctx) {
+        for nbr_typos in 0..=term.term_subset.max_typo_cost(ctx) {
            let mut term = term.clone();
            match nbr_typos {
                0 => {