Integrate the new Highlighter in the search

2025-07-03 03:47:02 +02:00 · 2023-04-06 13:58:56 +02:00 · 2023-04-06 13:58:56 +02:00 · 9c5f64769a
commit 9c5f64769a
parent ebe23b04c9
6 changed files with 77 additions and 41 deletions
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@ -97,8 +97,8 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::{
-    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
-    MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
+    SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };

 pub type Result<T> = std::result::Result<T, error::Error>;
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@ -289,8 +289,7 @@ mod tests {

    use charabia::TokenKind;

-    use super::*;
-    use crate::MatchingWords;
+    use super::{MatchingWords, *};

    #[test]
    fn test_bytes_to_highlight() {
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -5,9 +5,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;

 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-pub use self::matches::{
-    FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
-};
+pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
 use crate::{
    execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
@ -109,9 +107,9 @@ impl<'a> Search<'a> {
    }

    pub fn execute(&self) -> Result<SearchResult> {
-        let mut ctx = SearchContext::new(self.index, self.rtxn);
+        let ctx = SearchContext::new(self.index, self.rtxn);
        execute_search(
-            &mut ctx,
+            ctx,
            &self.query,
            self.terms_matching_strategy,
            self.exhaustive_number_hits,
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@ -1,4 +1,5 @@
 use std::cmp::Reverse;
+use std::fmt;
 use std::ops::RangeInclusive;

 use charabia::Token;
@ -23,6 +24,7 @@ pub struct LocatedMatchingWords {

 /// Structure created from a query tree
 /// referencing words that match the given query tree.
+#[derive(Default)]
 pub struct MatchingWords {
    word_interner: DedupInterner<String>,
    phrase_interner: DedupInterner<Phrase>,
@ -240,6 +242,40 @@ impl<'a> PartialMatch<'a> {
    }
 }

+impl fmt::Debug for MatchingWords {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
+
+        let phrases: Vec<_> = phrases
+            .iter()
+            .map(|p| {
+                (
+                    phrase_interner
+                        .get(p.value)
+                        .words
+                        .iter()
+                        .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
+                        .collect::<Vec<_>>()
+                        .join(" "),
+                    p.positions.clone(),
+                )
+            })
+            .collect();
+
+        let words: Vec<_> = words
+            .iter()
+            .flat_map(|w| {
+                w.value
+                    .iter()
+                    .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+
+        f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
    use std::borrow::Cow;
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@ -1,7 +1,8 @@
 use std::borrow::Cow;

 use charabia::{SeparatorKind, Token, Tokenizer};
-use matching_words::{MatchType, MatchingWords, PartialMatch, WordId};
+pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;

 use super::query_term::LocatedQueryTerm;
@ -23,12 +24,7 @@ pub struct MatcherBuilder<'a, A> {
 }

 impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(
-        ctx: SearchContext,
-        located_terms: Vec<LocatedQueryTerm>,
-        tokenizer: Tokenizer<'a, 'a, A>,
-    ) -> Self {
-        let matching_words = MatchingWords::new(ctx, located_terms);
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
        Self {
            matching_words,
            tokenizer,
@ -514,7 +510,8 @@ mod tests {
            let tokenizer = TokenizerBuilder::new().build();
            let tokens = tokenizer.tokenize(query);
            let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
-            Self::new(ctx, query_terms, TokenizerBuilder::new().build())
+            let matching_words = MatchingWords::new(ctx, query_terms);
+            Self::new(matching_words, TokenizerBuilder::new().build())
        }
    }

--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -4,7 +4,7 @@ mod graph_based_ranking_rule;
 mod interner;
 mod limits;
 mod logger;
-mod matches;
+pub mod matches;
 mod query_graph;
 mod query_term;
 mod ranking_rule_graph;
@ -271,7 +271,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(

 #[allow(clippy::too_many_arguments)]
 pub fn execute_search(
-    ctx: &mut SearchContext,
+    mut ctx: SearchContext,
    query: &Option<String>,
    terms_matching_strategy: TermsMatchingStrategy,
    exhaustive_number_hits: bool,
@ -284,21 +284,22 @@ pub fn execute_search(
    query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<SearchResult> {
    let mut universe = if let Some(filters) = filters {
-        filters.evaluate(ctx.txn, ctx.index)?
+        filters.evaluate(&mut ctx.txn, &mut ctx.index)?
    } else {
-        ctx.index.documents_ids(ctx.txn)?
+        ctx.index.documents_ids(&mut ctx.txn)?
    };

+    let mut located_query_terms = None;
    let documents_ids = if let Some(query) = query {
        // We make sure that the analyzer is aware of the stop words
        // this ensures that the query builder is able to properly remove them.
        let mut tokbuilder = TokenizerBuilder::new();
-        let stop_words = ctx.index.stop_words(ctx.txn)?;
+        let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?;
        if let Some(ref stop_words) = stop_words {
            tokbuilder.stop_words(stop_words);
        }

-        let script_lang_map = ctx.index.script_language(ctx.txn)?;
+        let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?;
        if !script_lang_map.is_empty() {
            tokbuilder.allow_list(&script_lang_map);
        }
@ -306,27 +307,31 @@ pub fn execute_search(
        let tokenizer = tokbuilder.build();
        let tokens = tokenizer.tokenize(query);

-        let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
-        let graph = QueryGraph::from_query(ctx, &query_terms)?;
+        let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?;
+        let graph = QueryGraph::from_query(&mut ctx, &query_terms)?;
+        located_query_terms = Some(query_terms);

-        check_sort_criteria(ctx, sort_criteria.as_ref())?;
+        check_sort_criteria(&mut ctx, sort_criteria.as_ref())?;

        universe = resolve_maximally_reduced_query_graph(
-            ctx,
+            &mut ctx,
            &universe,
            &graph,
            terms_matching_strategy,
            query_graph_logger,
        )?;

-        let ranking_rules =
-            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
+        let ranking_rules = get_ranking_rules_for_query_graph_search(
+            &mut ctx,
+            sort_criteria,
+            terms_matching_strategy,
+        )?;

-        bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
+        bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
    } else {
-        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
+        let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?;
        bucket_sort(
-            ctx,
+            &mut ctx,
            ranking_rules,
            &PlaceholderQuery,
            &universe,
@ -340,19 +345,20 @@ pub fn execute_search(
    // is requested and a distinct attribute is set.
    let mut candidates = universe;
    if exhaustive_number_hits {
-        if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
-            if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) {
-                candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining;
+        if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? {
+            if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) {
+                candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining;
            }
        }
    }

-    Ok(SearchResult {
-        // TODO: correct matching words
-        matching_words: MatchingWords::default(),
-        candidates,
-        documents_ids,
-    })
+    // consume context and located_query_terms to build MatchingWords.
+    let matching_words = match located_query_terms {
+        Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
+        None => MatchingWords::default(),
+    };
+
+    Ok(SearchResult { matching_words, candidates, documents_ids })
 }

 fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>) -> Result<()> {