From 7ab48ed8c772c8ff49eb0cb95759fc69de53d925 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?=
Date: Tue, 11 Apr 2023 15:41:44 +0200
Subject: [PATCH] Matching words fixes

---
 milli/src/search/new/logger/mod.rs            |  2 -
 .../src/search/new/matches/matching_words.rs  | 64 +------------
 milli/src/search/new/matches/mod.rs           |  4 +-
 milli/src/search/new/mod.rs                   | 17 ++--
 milli/src/search/new/query_term/mod.rs        | 94 +++++++++++++++----
 5 files changed, 94 insertions(+), 87 deletions(-)

diff --git a/milli/src/search/new/logger/mod.rs b/milli/src/search/new/logger/mod.rs
index 15cb78784..889e811ad 100644
--- a/milli/src/search/new/logger/mod.rs
+++ b/milli/src/search/new/logger/mod.rs
@@ -1,8 +1,6 @@
 // #[cfg(test)]
 pub mod detailed;
 
-pub mod test_logger;
-
 use roaring::RoaringBitmap;
 
 use super::interner::{Interned, MappedInterner};
diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs
index d5d1b6906..0da1b3a78 100644
--- a/milli/src/search/new/matches/matching_words.rs
+++ b/milli/src/search/new/matches/matching_words.rs
@@ -5,9 +5,7 @@ use std::ops::RangeInclusive;
 use charabia::Token;
 
 use super::super::interner::Interned;
-use super::super::query_term::{
-    Lazy, LocatedQueryTerm, OneTypoTerm, QueryTerm, TwoTypoTerm, ZeroTypoTerm,
-};
+use super::super::query_term::LocatedQueryTerm;
 use super::super::{DedupInterner, Phrase};
 use crate::SearchContext;
 
@@ -33,68 +31,16 @@ pub struct MatchingWords {
     words: Vec<LocatedMatchingWords>,
 }
 
-/// Extract and centralize the different phrases and words to match stored in a QueryTerm.
-fn extract_matching_terms(term: &QueryTerm) -> (Vec<Interned<Phrase>>, Vec<Interned<String>>) {
-    let mut matching_words = Vec::new();
-    let mut matching_phrases = Vec::new();
-
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    let QueryTerm {
-        original: _,
-        is_multiple_words: _,
-        max_nbr_typos: _,
-        is_prefix: _,
-        zero_typo,
-        one_typo,
-        two_typo,
-    } = term;
-
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    let ZeroTypoTerm { phrase, zero_typo, prefix_of: _, synonyms, use_prefix_db: _ } = zero_typo;
-
-    // zero typo
-    if let Some(phrase) = phrase {
-        matching_phrases.push(*phrase);
-    }
-    if let Some(zero_typo) = zero_typo {
-        matching_words.push(*zero_typo);
-    }
-    for synonym in synonyms {
-        matching_phrases.push(*synonym);
-    }
-
-    // one typo
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = one_typo {
-        if let Some(split_words) = split_words {
-            matching_phrases.push(*split_words);
-        }
-        for one_typo in one_typo {
-            matching_words.push(*one_typo);
-        }
-    }
-
-    // two typos
-    // the structure is exhaustively extracted to ensure that no field is missing.
-    if let Lazy::Init(TwoTypoTerm { two_typos }) = two_typo {
-        for two_typos in two_typos {
-            matching_words.push(*two_typos);
-        }
-    }
-
-    (matching_phrases, matching_words)
-}
-
 impl MatchingWords {
     pub fn new(ctx: SearchContext, located_terms: Vec<LocatedQueryTerm>) -> Self {
         let mut phrases = Vec::new();
         let mut words = Vec::new();
 
-        // Extract and centralize the different phrases and words to match stored in a QueryTerm using extract_matching_terms
+        // Extract and centralize the different phrases and words to match stored in a QueryTerm
         // and wrap them in dedicated structures.
         for located_term in located_terms {
             let term = ctx.term_interner.get(located_term.value);
-            let (matching_phrases, matching_words) = extract_matching_terms(term);
+            let (matching_words, matching_phrases) = term.all_computed_derivations();
 
             for matching_phrase in matching_phrases {
                 phrases.push(LocatedMatchingPhrase {
@@ -106,8 +52,8 @@ impl MatchingWords {
             words.push(LocatedMatchingWords {
                 value: matching_words,
                 positions: located_term.positions.clone(),
-                is_prefix: term.is_prefix,
-                original_char_count: ctx.word_interner.get(term.original).chars().count(),
+                is_prefix: term.is_cached_prefix(),
+                original_char_count: term.original_word(&ctx).chars().count(),
             });
         }
 
diff --git a/milli/src/search/new/matches/mod.rs b/milli/src/search/new/matches/mod.rs
index 8dded0cab..84bdea7ab 100644
--- a/milli/src/search/new/matches/mod.rs
+++ b/milli/src/search/new/matches/mod.rs
@@ -137,7 +137,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                 }
                 // partial match is now full, we keep this matches and we advance positions
                 Some(MatchType::Full { char_len, ids }) => {
-                    let ids: Vec<_> = ids.clone().into_iter().collect();
+                    let ids: Vec<_> = ids.clone().collect();
                     // save previously matched tokens as matches.
                     let iter = potential_matches.into_iter().map(
                         |(token_position, word_position, match_len)| Match {
@@ -192,7 +192,7 @@ impl<'t, A: AsRef<[u8]>> Matcher<'t, '_, A> {
                 // we match, we save the current token as a match,
                 // then we continue the rest of the tokens.
                 MatchType::Full { char_len, ids } => {
-                    let ids: Vec<_> = ids.clone().into_iter().collect();
+                    let ids: Vec<_> = ids.clone().collect();
                     matches.push(Match {
                         match_len: char_len,
                         ids,
diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs
index 0445b3e94..e07a17029 100644
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@@ -35,20 +35,20 @@ pub use logger::detailed::DetailedSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
 use query_term::{located_query_terms_from_string, LocatedQueryTerm, Phrase, QueryTerm};
-use ranking_rules::{bucket_sort, PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
+use ranking_rules::{PlaceholderQuery, RankingRuleOutput, RankingRuleQueryTrait};
 use resolve_query_graph::PhraseDocIdsCache;
 use roaring::RoaringBitmap;
 use words::Words;
 
-use self::bucket_sort::BucketSortOutput;
-use self::exact_attribute::ExactAttribute;
-use self::graph_based_ranking_rule::Exactness;
-use self::interner::Interner;
-use self::ranking_rules::{BoxRankingRule, RankingRule};
-use self::resolve_query_graph::compute_query_graph_docids;
-use self::sort::Sort;
 use crate::search::new::distinct::apply_distinct_rule;
 use crate::{AscDesc, DocumentId, Filter, Index, Member, Result, TermsMatchingStrategy, UserError};
+use bucket_sort::BucketSortOutput;
+use exact_attribute::ExactAttribute;
+use graph_based_ranking_rule::Exactness;
+use interner::Interner;
+use ranking_rules::{BoxRankingRule, RankingRule};
+use resolve_query_graph::compute_query_graph_docids;
+use sort::Sort;
 
 /// A structure used throughout the execution of a search query.
 pub struct SearchContext<'ctx> {
@@ -361,6 +361,7 @@ pub fn execute_search(
     Ok(PartialSearchResult {
         candidates: all_candidates,
         documents_ids: docids,
+        located_query_terms,
     })
 }
 
diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs
index 896c70e1b..83320139b 100644
--- a/milli/src/search/new/query_term/mod.rs
+++ b/milli/src/search/new/query_term/mod.rs
@@ -188,17 +188,35 @@ impl QueryTermSubset {
         }
         let original = ctx.term_interner.get_mut(self.original);
 
-        if !self.zero_typo_subset.is_empty() {
-            let ZeroTypoTerm {
-                phrase: _,
-                exact: zero_typo,
-                prefix_of,
-                synonyms: _,
-                use_prefix_db: _,
-            } = &original.zero_typo;
-            result.extend(zero_typo.iter().copied());
-            result.extend(prefix_of.iter().copied());
-        };
+        match &self.zero_typo_subset {
+            NTypoTermSubset::All => {
+                let ZeroTypoTerm {
+                    phrase: _,
+                    exact: zero_typo,
+                    prefix_of,
+                    synonyms: _,
+                    use_prefix_db: _,
+                } = &original.zero_typo;
+                result.extend(zero_typo.iter().copied());
+                result.extend(prefix_of.iter().copied());
+            }
+            NTypoTermSubset::Subset { words, phrases: _ } => {
+                let ZeroTypoTerm {
+                    phrase: _,
+                    exact: zero_typo,
+                    prefix_of,
+                    synonyms: _,
+                    use_prefix_db: _,
+                } = &original.zero_typo;
+                if let Some(zero_typo) = zero_typo {
+                    if words.contains(zero_typo) {
+                        result.insert(*zero_typo);
+                    }
+                }
+                result.extend(prefix_of.intersection(words).copied());
+            }
+            NTypoTermSubset::Nothing => {}
+        }
 
         match &self.one_typo_subset {
             NTypoTermSubset::All => {
@@ -248,11 +266,24 @@ impl QueryTermSubset {
         result.extend(phrase.iter().copied());
         result.extend(synonyms.iter().copied());
 
-        if !self.one_typo_subset.is_empty() {
-            let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
-                panic!();
-            };
-            result.extend(split_words.iter().copied());
+        match &self.one_typo_subset {
+            NTypoTermSubset::All => {
+                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                    panic!();
+                };
+                result.extend(split_words.iter().copied());
+            }
+            NTypoTermSubset::Subset { phrases, .. } => {
+                let Lazy::Init(OneTypoTerm { split_words, one_typo: _ }) = &original.one_typo else {
+                    panic!();
+                };
+                if let Some(split_words) = split_words {
+                    if phrases.contains(split_words) {
+                        result.insert(*split_words);
+                    }
+                }
+            }
+            NTypoTermSubset::Nothing => {}
         }
 
         Ok(result)
@@ -368,3 +399,34 @@ impl LocatedQueryTerm {
         interner.get(self.value).is_empty()
     }
 }
+
+impl QueryTerm {
+    pub fn is_cached_prefix(&self) -> bool {
+        self.zero_typo.use_prefix_db.is_some()
+    }
+    pub fn original_word(&self, ctx: &SearchContext) -> String {
+        ctx.word_interner.get(self.original).clone()
+    }
+    pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
+        let mut words = BTreeSet::new();
+        let mut phrases = BTreeSet::new();
+
+        let ZeroTypoTerm { phrase, exact: zero_typo, prefix_of, synonyms, use_prefix_db: _ } =
+            &self.zero_typo;
+        words.extend(zero_typo.iter().copied());
+        words.extend(prefix_of.iter().copied());
+        phrases.extend(phrase.iter().copied());
+        phrases.extend(synonyms.iter().copied());
+
+        if let Lazy::Init(OneTypoTerm { split_words, one_typo }) = &self.one_typo {
+            words.extend(one_typo.iter().copied());
+            phrases.extend(split_words.iter().copied());
+        };
+
+        if let Lazy::Init(TwoTypoTerm { two_typos }) = &self.two_typo {
+            words.extend(two_typos.iter().copied());
+        };
+
+        (words.into_iter().collect(), phrases.into_iter().collect())
+    }
+}
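
The core of this patch is the new QueryTerm::all_computed_derivations helper, which gathers the already-computed word and phrase derivations (zero-, one- and two-typo) into de-duplicated sets so that MatchingWords::new no longer destructures QueryTerm itself. Below is a minimal, self-contained Rust sketch of that collection pattern; the WordId/PhraseId aliases and the simplified Term struct are hypothetical stand-ins for milli's interned types, not the real definitions.

use std::collections::BTreeSet;

// Hypothetical, simplified stand-ins for milli's interned identifiers.
type WordId = u32;
type PhraseId = u32;

// A trimmed-down model of a query term and its computed derivations.
struct Term {
    zero_typo: Option<WordId>,
    prefix_of: Vec<WordId>,
    synonyms: Vec<PhraseId>,
    split_words: Option<PhraseId>,
    one_typo: Vec<WordId>,
    two_typos: Vec<WordId>,
}

impl Term {
    // Collect every word and phrase derivation into de-duplicated, sorted lists,
    // mirroring the shape of the real all_computed_derivations helper.
    fn all_computed_derivations(&self) -> (Vec<WordId>, Vec<PhraseId>) {
        let mut words = BTreeSet::new();
        let mut phrases = BTreeSet::new();

        words.extend(self.zero_typo.iter().copied());
        words.extend(self.prefix_of.iter().copied());
        words.extend(self.one_typo.iter().copied());
        words.extend(self.two_typos.iter().copied());

        phrases.extend(self.synonyms.iter().copied());
        phrases.extend(self.split_words.iter().copied());

        (words.into_iter().collect(), phrases.into_iter().collect())
    }
}

fn main() {
    let term = Term {
        zero_typo: Some(1),
        prefix_of: vec![2, 3],
        synonyms: vec![10],
        split_words: Some(11),
        one_typo: vec![3, 4],
        two_typos: vec![5],
    };
    let (words, phrases) = term.all_computed_derivations();
    assert_eq!(words, vec![1, 2, 3, 4, 5]);
    assert_eq!(phrases, vec![10, 11]);
}

Using BTreeSet keeps the derivations de-duplicated and deterministically ordered before they are handed to the matching/highlighting side, which is the same reason the patch uses BTreeSet in the real helper.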