Also check the exact word prefix DB when deciding to use the prefix DB, unless the word is an ngram

2025-06-24 07:28:29 +02:00 · 2023-04-12 15:14:00 +02:00 · 2023-04-12 15:14:00 +02:00 · 38b7b31beb
commit 38b7b31beb
parent 7a01f20df7
2 changed files with 23 additions and 8 deletions
--- a/milli/src/search/new/query_term/compute_derivations.rs
+++ b/milli/src/search/new/query_term/compute_derivations.rs
@@ -177,6 +177,7 @@ pub fn partially_initialized_term_from_word(
    word: &str,
    max_typo: u8,
    is_prefix: bool,
+    is_ngram: bool,
 ) -> Result<QueryTerm> {
    let word_interned = ctx.word_interner.insert(word.to_owned());

@@ -197,12 +198,19 @@ pub fn partially_initialized_term_from_word(
    let fst = ctx.index.words_fst(ctx.txn)?;

    let use_prefix_db = is_prefix
-        && ctx
+        && (ctx
            .index
            .word_prefix_docids
            .remap_data_type::<DecodeIgnore>()
            .get(ctx.txn, word)?
-            .is_some();
+            .is_some()
+            || (!is_ngram
+                && ctx
+                    .index
+                    .exact_word_prefix_docids
+                    .remap_data_type::<DecodeIgnore>()
+                    .get(ctx.txn, word)?
+                    .is_some()));
    let use_prefix_db = if use_prefix_db { Some(word_interned) } else { None };

    let mut zero_typo = None;
--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@@ -1,8 +1,8 @@
-use charabia::{normalizer::NormalizedTokenIter, SeparatorKind, TokenKind};
-
-use crate::{Result, SearchContext, MAX_WORD_LENGTH};
+use charabia::normalizer::NormalizedTokenIter;
+use charabia::{SeparatorKind, TokenKind};

 use super::*;
+use crate::{Result, SearchContext, MAX_WORD_LENGTH};

 /// Convert the tokenised search query into a list of located query terms.
 // TODO: checking if the positions are correct for phrases, separators, ngrams
@@ -51,6 +51,7 @@ pub fn located_query_terms_from_string(
                                word,
                                nbr_typos(word),
                                false,
+                                false,
                            )?;
                            let located_term = LocatedQueryTerm {
                                value: ctx.term_interner.push(term),
@@ -62,8 +63,13 @@ pub fn located_query_terms_from_string(
                    }
                } else {
                    let word = token.lemma();
-                    let term =
-                        partially_initialized_term_from_word(ctx, word, nbr_typos(word), true)?;
+                    let term = partially_initialized_term_from_word(
+                        ctx,
+                        word,
+                        nbr_typos(word),
+                        true,
+                        false,
+                    )?;
                    let located_term = LocatedQueryTerm {
                        value: ctx.term_interner.push(term),
                        positions: position..=position,
@@ -195,7 +201,8 @@ pub fn make_ngram(
    let max_nbr_typos =
        number_of_typos_allowed(ngram_str.as_str()).saturating_sub(terms.len() as u8 - 1);

-    let mut term = partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix)?;
+    let mut term =
+        partially_initialized_term_from_word(ctx, &ngram_str, max_nbr_typos, is_prefix, true)?;

    // Now add the synonyms
    let index_synonyms = ctx.index.synonyms(ctx.txn)?;