From e4a3e603b36808e10ad732ed1c0ae2acf9cd49c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Mar 2024 17:31:56 +0100 Subject: [PATCH] Expose a first working version of the negative keyword --- milli/src/search/new/mod.rs | 22 ++++++++++++++++++- .../src/search/new/query_term/parse_query.rs | 21 ++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ad996f363..ec83b84d1 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -209,6 +209,20 @@ fn resolve_universe( ) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] +fn resolve_negative_words( + ctx: &mut SearchContext, + negative_words: &[Word], +) -> Result<RoaringBitmap> { + let mut negative_bitmap = RoaringBitmap::new(); + for &word in negative_words { + if let Some(bitmap) = ctx.word_docids(word)? { + negative_bitmap |= bitmap; + } + } + Ok(negative_bitmap) +} + /// Return the list of initialised ranking rules to be used for a placeholder search. 
fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, @@ -620,7 +634,12 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); drop(entered); - let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; + let (query_terms, negative_words) = + located_query_terms_from_tokens(ctx, tokens, words_limit)?; + + let ignored_documents = resolve_negative_words(ctx, &negative_words)?; + universe -= ignored_documents; + if query_terms.is_empty() { // Do a placeholder search instead None @@ -630,6 +649,7 @@ pub fn execute_search( } else { None }; + let bucket_sort_output = if let Some(query_terms) = query_terms { let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(new_located_query_terms); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index ea997a41a..b23cb2426 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -6,6 +6,7 @@ use charabia::{SeparatorKind, TokenKind}; use super::compute_derivations::partially_initialized_term_from_word; use super::{LocatedQueryTerm, ZeroTypoTerm}; use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; +use crate::search::new::Word; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. 
@@ -14,12 +15,14 @@ pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, words_limit: Option<usize>, -) -> Result<Vec<LocatedQueryTerm>> { +) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> { let nbr_typos = number_of_typos_allowed(ctx)?; let mut located_terms = Vec::new(); let mut phrase: Option<PhraseBuilder> = None; + let mut negative_next_token = false; + let mut negative_words = Vec::new(); let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -33,7 +36,7 @@ pub fn located_query_terms_from_tokens( } // early return if word limit is exceeded if located_terms.len() >= parts_limit { - return Ok(located_terms); + return Ok((located_terms, negative_words)); } match token.kind { @@ -46,6 +49,11 @@ pub fn located_query_terms_from_tokens( // 3. if the word is the last token of the query we push it as a prefix word. if let Some(phrase) = &mut phrase { phrase.push_word(ctx, &token, position) + } else if negative_next_token { + let word = token.lemma().to_string(); + let word = Word::Original(ctx.word_interner.insert(word)); + negative_words.push(word); + negative_next_token = false; } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { @@ -63,7 +71,7 @@ pub fn located_query_terms_from_tokens( }; located_terms.push(located_term); } - TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (), } } else { let word = token.lemma(); @@ -122,6 +130,10 @@ pub fn located_query_terms_from_tokens( // Start new phrase if the token ends with an opening quote (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) }; + + if phrase.is_none() && token.lemma() == "-" { + negative_next_token = true; + } } _ => (), } @@ -134,7 +146,7 @@ pub fn located_query_terms_from_tokens( } } - Ok(located_terms) + Ok((located_terms, negative_words)) } pub fn number_of_typos_allowed<'ctx>( @@ -317,6 +329,7 @@ mod tests { // panics with `attempt to add with overflow` before let located_query_terms = 
located_query_terms_from_tokens(&mut ctx, tokens, None)?; assert!(located_query_terms.is_empty()); + Ok(()) } }