From 7ab48ed8c772c8ff49eb0cb95759fc69de53d925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 11 Apr 2023 15:41:44 +0200
Subject: [PATCH] Matching words fixes

---
 milli/src/search/new/logger/mod.rs            |  2 -
 .../src/search/new/matches/matching_words.rs  | 64 +------------
 milli/src/search/new/matches/mod.rs           |  4 +-
 milli/src/search/new/mod.rs                   | 17 ++--
 milli/src/search/new/query_term/mod.rs        | 94 +++++++++++++++----
 5 files changed, 94 insertions(+), 87 deletions(-)

diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 15cb78784..889e811ad 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -1,8 +1,6 @@
 // #[cfg(test)]
 pub mod detailed;
 
-pub mod test_logger;
-
 use roaring::RoaringBitmap;
 
 use super::interner::{Interned, MappedInterner};
diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index d5d1b6906..0da1b3a78 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -5,9 +5,7 @@ use std::ops::RangeInclusive;
 use charabia::Token;
 
 use super::super::interner::Interned;
-use super::super::query_term::{
-    Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
-};
+use super::super::query_term::LocatedQueryTerm;
 use super::super::{DedupInterner, Phrase};
 use crate::SearchContext;
 
@@ -33,68 +31,16 @@ pub struct MatchingWords {
     words: Vec<LocatedMatchingWords>,
 }
 
-/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
-fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
-    let mut matching_words = Vec::new();
-    let mut matching_phrases = Vec::new();
-
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    let QueryTerm {
-        original: _,
-        is_multiple_words: _,
-        max_nbr_typos: _,
-        is_prefix: _,
-        zero_typo,
-        one_typo,
-        two_typo,
-    } = term;
-
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
-
-    // zero typo
-    if let Some(phrase) = phrase {
-        matching_phrases.push(*phrase);
-    }
-    if let Some(zero_typo) = zero_typo {
-        matching_words.push(*zero_typo);
-    }
-    for synonym in synonyms {
-        matching_phrases.push(*synonym);
-    }
-
-    // one typo
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
-        if let Some(split_words) = split_words {
-            matching_phrases.push(*split_words);
-        }
-        for one_typo in one_typo {
-            matching_words.push(*one_typo);
-        }
-    }
-
-    // two typos
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
-        for two_typos in two_typos {
-            matching_words.push(*two_typos);
-        }
-    }
-
-    (matching_phrases, matching_words)
-}
-
 impl MatchingWords {
     pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
         let mut phrases = Vec::new();
         let mut words = Vec::new();
 
-        // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
+        // Extract and centralize the different phrases and words to match stored in a QueryTerm
         // and wrap them in dedicated structures.
         for located_term in located_terms {
             let term = ctx.term_interner.get(located_term.value);
-            let (matching_phrases, matching_words) = extract_matching_terms(term);
+            let (matching_words, matching_phrases) = term.all_computed_derivations();
 
             for matching_phrase in matching_phrases {
                 phrases.push(LocatedMatchingPhrase {
@@ -106,8 +52,8 @@ impl MatchingWords {
             words.push(LocatedMatchingWords {
                 value: matching_words,
                 positions: located_term.positions.clone(),
-                is_prefix: term.is_prefix,
-                original_char_count: ctx.word_interner.get(term.original).chars().count(),
+                is_prefix: term.is_cached_prefix(),
+                original_char_count: term.original_word(&ctx).chars().count(),
             });
         }
 
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 8dded0cab..84bdea7ab 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -137,7 +137,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                 }
                 // partial match is now full, we keep this matches and we advance positions
                 Some(MatchType::Full { char_len, ids }) => {
-                    let ids: Vec<_> = ids.clone().into_iter().collect();
+                    let ids: Vec<_> = ids.clone().collect();
                     // save previously matched tokens as matches.
                     let iter = potential_matches.into_iter().map(
                         |(token_position, word_position, match_len)| Match {
@@ -192,7 +192,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                 // we match, we save the current token as a match,
                 // then we continue the rest of the tokens.
                 MatchType::Full { char_len, ids } => {
-                    let ids: Vec<_> = ids.clone().into_iter().collect();
+                    let ids: Vec<_> = ids.clone().collect();
                     matches.push(Match {
                         match_len: char_len,
                         ids,
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 0445b3e94..e07a17029 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -35,20 +35,20 @@ pub use logger::detailed::DetailedSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
 use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
-use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
+use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
 use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 
-use self::bucket_sort::BucketSortOutput;
-use self::exact_attribute::ExactAttribute;
-use self::graph_based_ranking_rule::Exactness;
-use self::interner::Interner;
-use self::ranking_rules::{BoxRankingRule, RankingRule};
-use self::resolve_query_graph::compute_query_graph_docids;
-use self::sort::Sort;
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
+use bucket_sort::BucketSortOutput;
+use exact_attribute::ExactAttribute;
+use graph_based_ranking_rule::Exactness;
+use interner::Interner;
+use ranking_rules::{BoxRankingRule, RankingRule};
+use resolve_query_graph::compute_query_graph_docids;
+use sort::Sort;
 
 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@@ -361,6 +361,7 @@ pub fn execute_search(
     Ok(PartialSearchResult {
         candidates: all_candidates,
         documents_ids: docids,
+        located_query_terms,
     })
 }
 
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
index 896c70e1b..83320139b 100644
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -188,17 +188,35 @@ impl QueryTermSubset {
         }
         let original = ctx.term_interner.get_mut(self.original);
 
-        if !self.zero_typo_subset.is_empty() {
-            let ZeroTypoTerm {
-                phrase: _,
-                exact: zero_typo,
-                prefix_of,
-                synonyms: _,
-                use_prefix_db: _,
-            } = &original.zero_typo;
-            result.extend(zero_typo.iter().copied());
-            result.extend(prefix_of.iter().copied());
-        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => {
+                let ZeroTypoTerm {
+                    phrase: _,
+                    exact: zero_typo,
+                    prefix_of,
+                    synonyms: _,
+                    use_prefix_db: _,
+                } = &original.zero_typo;
+                result.extend(zero_typo.iter().copied());
+                result.extend(prefix_of.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let ZeroTypoTerm {
+                    phrase: _,
+                    exact: zero_typo,
+                    prefix_of,
+                    synonyms: _,
+                    use_prefix_db: _,
+                } = &original.zero_typo;
+                if let Some(zero_typo) = zero_typo {
+                    if words.contains(zero_typo) {
+                        result.insert(*zero_typo);
+                    }
+                }
+                result.extend(prefix_of.intersection(words).copied());
+            }
+            NTypoTermSubset::Nothing => {}
+        }
 
         match &self.one_typo_subset {
             NTypoTermSubset::All => {
@@ -248,11 +266,24 @@ impl QueryTermSubset {
         result.extend(phrase.iter().copied());
         result.extend(synonyms.iter().copied());
 
-        if !self.one_typo_subset.is_empty() {
-            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
-                panic!();
-            };
-            result.extend(split_words.iter().copied());
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                    panic!();
+                };
+                result.extend(split_words.iter().copied());
+            }
+            NTypoTermSubset::Subset { phrases, .. } => {
+                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                    panic!();
+                };
+                if let Some(split_words) = split_words {
+                    if phrases.contains(split_words) {
+                        result.insert(*split_words);
+                    }
+                }
+            }
+            NTypoTermSubset::Nothing => {}
         }
 
         Ok(result)
@@ -368,3 +399,34 @@ impl LocatedQueryTerm {
         interner.get(self.value).is_empty()
     }
 }
+
+impl QueryTerm {
+    pub fn is_cached_prefix(&self) -> bool {
+        self.zero_typo.use_prefix_db.is_some()
+    }
+    pub fn original_word(&self, ctx: &SearchContext) -> String {
+        ctx.word_interner.get(self.original).clone()
+    }
+    pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
+        let mut words = BTreeSet::new();
+        let mut phrases = BTreeSet::new();
+
+        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
+            &self.zero_typo;
+        words.extend(zero_typo.iter().copied());
+        words.extend(prefix_of.iter().copied());
+        phrases.extend(phrase.iter().copied());
+        phrases.extend(synonyms.iter().copied());
+
+        if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
+            words.extend(one_typo.iter().copied());
+            phrases.extend(split_words.iter().copied());
+        };
+
+        if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
+            words.extend(two_typos.iter().copied());
+        };
+
+        (words.into_iter().collect(), phrases.into_iter().collect())
+    }
+}
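
The core of this patch is the new QueryTerm::all_computed_derivations helper, which gathers the already-computed word and phrase derivations (zero-, one- and two-typo) into de-duplicated sets so that MatchingWords::new no longer destructures QueryTerm itself. Below is a minimal, self-contained Rust sketch of that collection pattern; the WordId/PhraseId aliases and the simplified Term struct are hypothetical stand-ins for milli's interned types, not the real definitions.

use std::collections::BTreeSet;

// Hypothetical, simplified stand-ins for milli's interned identifiers.
type WordId = u32;
type PhraseId = u32;

// A trimmed-down model of a query term and its computed derivations.
struct Term {
    zero_typo: Option<WordId>,
    prefix_of: Vec<WordId>,
    synonyms: Vec<PhraseId>,
    split_words: Option<PhraseId>,
    one_typo: Vec<WordId>,
    two_typos: Vec<WordId>,
}

impl Term {
    // Collect every word and phrase derivation into de-duplicated, sorted lists,
    // mirroring the shape of the real all_computed_derivations helper.
    fn all_computed_derivations(&self) -> (Vec<WordId>, Vec<PhraseId>) {
        let mut words = BTreeSet::new();
        let mut phrases = BTreeSet::new();

        words.extend(self.zero_typo.iter().copied());
        words.extend(self.prefix_of.iter().copied());
        words.extend(self.one_typo.iter().copied());
        words.extend(self.two_typos.iter().copied());

        phrases.extend(self.synonyms.iter().copied());
        phrases.extend(self.split_words.iter().copied());

        (words.into_iter().collect(), phrases.into_iter().collect())
    }
}

fn main() {
    let term = Term {
        zero_typo: Some(1),
        prefix_of: vec![2, 3],
        synonyms: vec![10],
        split_words: Some(11),
        one_typo: vec![3, 4],
        two_typos: vec![5],
    };
    let (words, phrases) = term.all_computed_derivations();
    assert_eq!(words, vec![1, 2, 3, 4, 5]);
    assert_eq!(phrases, vec![10, 11]);
}

Using BTreeSet keeps the derivations de-duplicated and deterministically ordered before they are handed to the matching/highlighting side, which is the same reason the patch uses BTreeSet in the real helper.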