diff --git a/milli/src/lib.rs b/milli/src/lib.rs
index eb63c3904..13e23a5bd 100644
--- a/milli/src/lib.rs
+++ b/milli/src/lib.rs
@@ -97,8 +97,8 @@ pub use self::heed_codec::{
 };
 pub use self::index::Index;
 pub use self::search::{
-    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWord,
-    MatchingWords, Search, SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
+    FacetDistribution, Filter, FormatOptions, MatchBounds, MatcherBuilder, MatchingWords, Search,
+    SearchResult, TermsMatchingStrategy, DEFAULT_VALUES_PER_FACET,
 };
 
 pub type Result<T, U = Error> = std::result::Result<T, U>;
diff --git a/milli/src/search/matches/matching_words.rs b/milli/src/search/matches/matching_words.rs
index 22241c457..5ccf0286f 100644
--- a/milli/src/search/matches/matching_words.rs
+++ b/milli/src/search/matches/matching_words.rs
@@ -289,8 +289,7 @@ mod tests {
 
     use charabia::TokenKind;
 
-    use super::*;
-    use crate::MatchingWords;
+    use super::{MatchingWords, *};
 
     #[test]
     fn test_bytes_to_highlight() {
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index 08803b73f..3683a5cf0 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -5,9 +5,7 @@ use once_cell::sync::Lazy;
 use roaring::bitmap::RoaringBitmap;
 
 pub use self::facet::{FacetDistribution, Filter, DEFAULT_VALUES_PER_FACET};
-pub use self::matches::{
-    FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWord, MatchingWords,
-};
+pub use self::new::matches::{FormatOptions, MatchBounds, Matcher, MatcherBuilder, MatchingWords};
 use crate::{
     execute_search, AscDesc, DefaultSearchLogger, DocumentId, Index, Result, SearchContext,
 };
@@ -109,9 +107,9 @@ impl<'a> Search<'a> {
     }
 
     pub fn execute(&self) -> Result<SearchResult> {
-        let mut ctx = SearchContext::new(self.index, self.rtxn);
+        let ctx = SearchContext::new(self.index, self.rtxn);
         execute_search(
-            &mut ctx,
+            ctx,
             &self.query,
             self.terms_matching_strategy,
             self.exhaustive_number_hits,
diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index e737dc942..4ca04884a 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -1,4 +1,5 @@
 use std::cmp::Reverse;
+use std::fmt;
 use std::ops::RangeInclusive;
 
 use charabia::Token;
@@ -23,6 +24,7 @@ pub struct LocatedMatchingWords {
 /// Structure created from a query tree
 /// referencing words that match the given query tree.
+#[derive(Default)]
 pub struct MatchingWords {
     word_interner: DedupInterner<String>,
     phrase_interner: DedupInterner<Phrase>,
     phrases: Vec<LocatedMatchingPhrase>,
@@ -240,6 +242,40 @@ impl<'a> PartialMatch<'a> {
     }
 }
 
+impl fmt::Debug for MatchingWords {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        let MatchingWords { word_interner, phrase_interner, phrases, words } = self;
+
+        let phrases: Vec<_> = phrases
+            .iter()
+            .map(|p| {
+                (
+                    phrase_interner
+                        .get(p.value)
+                        .words
+                        .iter()
+                        .map(|w| w.map_or("STOP_WORD", |w| word_interner.get(w)))
+                        .collect::<Vec<_>>()
+                        .join(" "),
+                    p.positions.clone(),
+                )
+            })
+            .collect();
+
+        let words: Vec<_> = words
+            .iter()
+            .flat_map(|w| {
+                w.value
+                    .iter()
+                    .map(|s| (word_interner.get(*s), w.positions.clone(), w.is_prefix))
+                    .collect::<Vec<_>>()
+            })
+            .collect();
+
+        f.debug_struct("MatchingWords").field("phrases", &phrases).field("words", &words).finish()
+    }
+}
+
 #[cfg(test)]
 pub(crate) mod tests {
     use std::borrow::Cow;
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 9b73c2098..2a9596902 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -1,7 +1,8 @@
 use std::borrow::Cow;
 
 use charabia::{SeparatorKind, Token, Tokenizer};
-use matching_words::{MatchType, MatchingWords, PartialMatch, WordId};
+pub use matching_words::MatchingWords;
+use matching_words::{MatchType, PartialMatch, WordId};
 use serde::Serialize;
 
 use super::query_term::LocatedQueryTerm;
@@ -23,12 +24,7 @@ pub struct MatcherBuilder<'a, A> {
 }
 
 impl<'a, A> MatcherBuilder<'a, A> {
-    pub fn new(
-        ctx: SearchContext,
-        located_terms: Vec<LocatedQueryTerm>,
-        tokenizer: Tokenizer<'a, 'a, A>,
-    ) -> Self {
-        let matching_words = MatchingWords::new(ctx, located_terms);
+    pub fn new(matching_words: MatchingWords, tokenizer: Tokenizer<'a, 'a, A>) -> Self {
         Self {
             matching_words,
             tokenizer,
@@ -514,7 +510,8 @@ mod tests {
             let tokenizer = TokenizerBuilder::new().build();
             let tokens = tokenizer.tokenize(query);
             let query_terms = located_query_terms_from_string(&mut ctx, tokens, None).unwrap();
-            Self::new(ctx, query_terms, TokenizerBuilder::new().build())
+            let matching_words = MatchingWords::new(ctx, query_terms);
+            Self::new(matching_words, TokenizerBuilder::new().build())
         }
     }
 
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index ef7e61ee1..0bb454c06 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -4,7 +4,7 @@ mod graph_based_ranking_rule;
 mod interner;
 mod limits;
 mod logger;
-mod matches;
+pub mod matches;
 mod query_graph;
 mod query_term;
 mod ranking_rule_graph;
@@ -271,7 +271,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>(
 
 #[allow(clippy::too_many_arguments)]
 pub fn execute_search(
-    ctx: &mut SearchContext,
+    mut ctx: SearchContext,
     query: &Option<String>,
     terms_matching_strategy: TermsMatchingStrategy,
     exhaustive_number_hits: bool,
@@ -284,21 +284,22 @@ pub fn execute_search(
     query_graph_logger: &mut dyn SearchLogger<QueryGraph>,
 ) -> Result<SearchResult> {
     let mut universe = if let Some(filters) = filters {
-        filters.evaluate(ctx.txn, ctx.index)?
+        filters.evaluate(&mut ctx.txn, &mut ctx.index)?
     } else {
-        ctx.index.documents_ids(ctx.txn)?
+        ctx.index.documents_ids(&mut ctx.txn)?
     };
 
+    let mut located_query_terms = None;
     let documents_ids = if let Some(query) = query {
         // We make sure that the analyzer is aware of the stop words
         // this ensures that the query builder is able to properly remove them.
         let mut tokbuilder = TokenizerBuilder::new();
-        let stop_words = ctx.index.stop_words(ctx.txn)?;
+        let stop_words = &mut ctx.index.stop_words(&mut ctx.txn)?;
         if let Some(ref stop_words) = stop_words {
             tokbuilder.stop_words(stop_words);
         }
 
-        let script_lang_map = ctx.index.script_language(ctx.txn)?;
+        let script_lang_map = &mut ctx.index.script_language(&mut ctx.txn)?;
         if !script_lang_map.is_empty() {
             tokbuilder.allow_list(&script_lang_map);
         }
@@ -306,27 +307,31 @@ pub fn execute_search(
         let tokenizer = tokbuilder.build();
         let tokens = tokenizer.tokenize(query);
-        let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
-        let graph = QueryGraph::from_query(ctx, &query_terms)?;
+        let query_terms = located_query_terms_from_string(&mut ctx, tokens, words_limit)?;
+        let graph = QueryGraph::from_query(&mut ctx, &query_terms)?;
+        located_query_terms = Some(query_terms);
 
-        check_sort_criteria(ctx, sort_criteria.as_ref())?;
+        check_sort_criteria(&mut ctx, sort_criteria.as_ref())?;
 
         universe = resolve_maximally_reduced_query_graph(
-            ctx,
+            &mut ctx,
             &universe,
             &graph,
             terms_matching_strategy,
             query_graph_logger,
         )?;
 
-        let ranking_rules =
-            get_ranking_rules_for_query_graph_search(ctx, sort_criteria, terms_matching_strategy)?;
+        let ranking_rules = get_ranking_rules_for_query_graph_search(
+            &mut ctx,
+            sort_criteria,
+            terms_matching_strategy,
+        )?;
 
-        bucket_sort(ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
+        bucket_sort(&mut ctx, ranking_rules, &graph, &universe, from, length, query_graph_logger)?
     } else {
-        let ranking_rules = get_ranking_rules_for_placeholder_search(ctx, sort_criteria)?;
+        let ranking_rules = get_ranking_rules_for_placeholder_search(&mut ctx, sort_criteria)?;
 
         bucket_sort(
-            ctx,
+            &mut ctx,
             ranking_rules,
             &PlaceholderQuery,
             &universe,
@@ -340,19 +345,20 @@ pub fn execute_search(
     // is requested and a distinct attribute is set.
     let mut candidates = universe;
    if exhaustive_number_hits {
-        if let Some(f) = ctx.index.distinct_field(ctx.txn)? {
-            if let Some(distinct_fid) = ctx.index.fields_ids_map(ctx.txn)?.id(f) {
-                candidates = apply_distinct_rule(ctx, distinct_fid, &candidates)?.remaining;
+        if let Some(f) = &mut ctx.index.distinct_field(&mut ctx.txn)? {
+            if let Some(distinct_fid) = ctx.index.fields_ids_map(&mut ctx.txn)?.id(f) {
+                candidates = apply_distinct_rule(&mut ctx, distinct_fid, &candidates)?.remaining;
             }
         }
     }
 
-    Ok(SearchResult {
-        // TODO: correct matching words
-        matching_words: MatchingWords::default(),
-        candidates,
-        documents_ids,
-    })
+    // consume context and located_query_terms to build MatchingWords.
+    let matching_words = match located_query_terms {
+        Some(located_query_terms) => MatchingWords::new(ctx, located_query_terms),
+        None => MatchingWords::default(),
+    };
+
+    Ok(SearchResult { matching_words, candidates, documents_ids })
 }
 
 fn check_sort_criteria(ctx: &SearchContext, sort_criteria: Option<&Vec<AscDesc>>) -> Result<()> {
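
Caller-side sketch of the reworked matcher API (not part of the patch): after this change `MatcherBuilder::new` takes the ready-made `MatchingWords` carried by `SearchResult` together with a tokenizer, so formatting no longer needs a `SearchContext`. The `highlight` helper below is hypothetical; only `MatcherBuilder::new(MatchingWords, Tokenizer)` and the now-populated `SearchResult::matching_words` come from this diff, while the `build`/`format`/`FormatOptions` usage is assumed from the surrounding matcher module.

```rust
use charabia::TokenizerBuilder;
use milli::{FormatOptions, MatcherBuilder, SearchResult};

/// Hypothetical helper: highlight `text` with the matching words that
/// `execute_search` computed and stored on the `SearchResult`.
fn highlight(result: SearchResult, text: &str) -> String {
    // No SearchContext here: MatchingWords was built once, inside
    // execute_search, and travels with the SearchResult.
    let builder = MatcherBuilder::new(result.matching_words, TokenizerBuilder::new().build());
    let mut matcher = builder.build(text);
    // FormatOptions fields are assumed from the existing matcher code.
    matcher.format(FormatOptions { highlight: true, crop: None }).into_owned()
}
```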