From 877f4b1045ae0d0eb9c72a5c36c0d939a736d906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Mar 2024 15:51:43 +0100 Subject: [PATCH] Support negative phrases --- .../src/search/new/matches/matching_words.rs | 4 +- milli/src/search/new/mod.rs | 26 +++++++- milli/src/search/new/query_term/mod.rs | 9 ++- .../src/search/new/query_term/parse_query.rs | 64 +++++++++++++++---- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index acc0de6b0..56bf6c169 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -240,6 +240,7 @@ pub(crate) mod tests { use super::super::super::located_query_terms_from_tokens; use super::*; use crate::index::tests::TempIndex; + use crate::search::new::query_term::ExtractedTokens; pub(crate) fn temp_index_with_documents() -> TempIndex { let temp_index = TempIndex::new(); @@ -261,7 +262,8 @@ pub(crate) mod tests { let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); - let (query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); + let ExtractedTokens { query_terms, .. 
} = + located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 99f4562d6..1f0ae7b29 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner}; pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; -use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm}; +use query_term::{ + located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm, +}; use ranking_rules::{ BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, }; @@ -223,6 +225,21 @@ fn resolve_negative_words( Ok(negative_bitmap) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] +fn resolve_negative_phrases( + ctx: &mut SearchContext, + negative_phrases: &[LocatedQueryTerm], +) -> Result<RoaringBitmap> { + let mut negative_bitmap = RoaringBitmap::new(); + for term in negative_phrases { + let query_term = ctx.term_interner.get(term.value); + if let Some(phrase) = query_term.original_phrase() { + negative_bitmap |= ctx.get_phrase_docids(phrase)?; + } + } + Ok(negative_bitmap) +} + /// Return the list of initialised ranking rules to be used for a placeholder search. 
fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, @@ -636,12 +653,15 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); drop(entered); - let (query_terms, negative_words) = + let ExtractedTokens { query_terms, negative_words, negative_phrases } = located_query_terms_from_tokens(ctx, tokens, words_limit)?; - used_negative_operator = !negative_words.is_empty(); + used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); let ignored_documents = resolve_negative_words(ctx, &negative_words)?; + let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; + universe -= ignored_documents; + universe -= ignored_phrases; if query_terms.is_empty() { // Do a placeholder search instead diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index a37e60ed0..70f1d3c4f 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -9,7 +9,9 @@ use std::ops::RangeInclusive; use either::Either; pub use ntypo_subset::NTypoTermSubset; -pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; +pub use parse_query::{ + located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens, +}; pub use phrase::Phrase; use super::interner::{DedupInterner, Interned}; @@ -478,6 +480,11 @@ impl QueryTerm { pub fn original_word(&self, ctx: &SearchContext) -> String { ctx.word_interner.get(self.original).clone() } + + pub fn original_phrase(&self) -> Option<Interned<Phrase>> { + self.zero_typo.phrase + } + pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) { let mut words = BTreeSet::new(); let mut phrases = BTreeSet::new(); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index bdb937384..86be7da77 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -9,21 +9,34 @@ use 
crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::search::new::Word; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; +#[derive(Clone)] +/// Extraction of the content of a query. +pub struct ExtractedTokens { + /// The terms to search for in the database. + pub query_terms: Vec<LocatedQueryTerm>, + /// The words that must not appear in the results. + pub negative_words: Vec<Word>, + /// The phrases that must not appear in the results. + pub negative_phrases: Vec<LocatedQueryTerm>, +} + /// Convert the tokenised search query into a list of located query terms. #[tracing::instrument(level = "trace", skip_all, target = "search::query")] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, words_limit: Option<usize>, -) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> { +) -> Result<ExtractedTokens> { let nbr_typos = number_of_typos_allowed(ctx)?; - let mut located_terms = Vec::new(); + let mut query_terms = Vec::new(); + let mut negative_phrase = false; let mut phrase: Option<PhraseBuilder> = None; let mut encountered_whitespace = true; let mut negative_next_token = false; let mut negative_words = Vec::new(); + let mut negative_phrases = Vec::new(); let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -37,8 +50,8 @@ pub fn located_query_terms_from_tokens( } // early return if word limit is exceeded - if located_terms.len() >= parts_limit { - return Ok((located_terms, negative_words)); + if query_terms.len() >= parts_limit { + return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }); } match token.kind { @@ -71,7 +84,7 @@ pub fn located_query_terms_from_tokens( value: ctx.term_interner.push(term), positions: position..=position, }; - located_terms.push(located_term); + query_terms.push(located_term); } TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (), } @@ -88,7 +101,7 @@ pub fn located_query_terms_from_tokens( value: ctx.term_interner.push(term), positions: position..=position, }; - located_terms.push(located_term); + query_terms.push(located_term); } } 
TokenKind::Separator(separator_kind) => { @@ -104,7 +117,14 @@ pub fn located_query_terms_from_tokens( let phrase = if separator_kind == SeparatorKind::Hard { if let Some(phrase) = phrase { if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) + // as we are evaluating a negative operator we put the phrase + // in the negative one *but* we don't reset the negative operator + // as we are immediately starting a new negative phrase. + if negative_phrase { + negative_phrases.push(located_query_term); + } else { + query_terms.push(located_query_term); + } } Some(PhraseBuilder::empty()) } else { @@ -125,12 +145,24 @@ pub fn located_query_terms_from_tokens( // Per the check above, quote_count > 0 quote_count -= 1; if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) + // we were evaluating a negative operator so we + // put the phrase in the negative phrases + if negative_phrase { + negative_phrases.push(located_query_term); + negative_phrase = false; + } else { + query_terms.push(located_query_term); + } } } // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + if quote_count % 2 == 1 { + negative_phrase = negative_next_token; + Some(PhraseBuilder::empty()) + } else { + None + } }; negative_next_token = @@ -146,11 +178,16 @@ pub fn located_query_terms_from_tokens( // If a quote is never closed, we consider all of the end of the query as a phrase. if let Some(phrase) = phrase.take() { if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term); + // put the phrase in the negative set if we are evaluating a negative operator. 
+ if negative_phrase { + negative_phrases.push(located_query_term); + } else { + query_terms.push(located_query_term); + } } } - Ok((located_terms, negative_words)) + Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }) } pub fn number_of_typos_allowed<'ctx>( @@ -331,8 +368,9 @@ mod tests { let rtxn = index.read_txn()?; let mut ctx = SearchContext::new(&index, &rtxn); // panics with `attempt to add with overflow` before - let (located_query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None)?; - assert!(located_query_terms.is_empty()); + let ExtractedTokens { query_terms, .. } = + located_query_terms_from_tokens(&mut ctx, tokens, None)?; + assert!(query_terms.is_empty()); Ok(()) }