From e4a3e603b36808e10ad732ed1c0ae2acf9cd49c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Mar 2024 17:31:56 +0100 Subject: [PATCH 1/6] Expose a first working version of the negative keyword --- milli/src/search/new/mod.rs | 22 ++++++++++++++++++- .../src/search/new/query_term/parse_query.rs | 21 ++++++++++++++---- 2 files changed, 38 insertions(+), 5 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ad996f363..ec83b84d1 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -209,6 +209,20 @@ fn resolve_universe( ) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] +fn resolve_negative_words( + ctx: &mut SearchContext, + negative_words: &[Word], +) -> Result { + let mut negative_bitmap = RoaringBitmap::new(); + for &word in negative_words { + if let Some(bitmap) = ctx.word_docids(word)? { + negative_bitmap |= bitmap; + } + } + Ok(negative_bitmap) +} + /// Return the list of initialised ranking rules to be used for a placeholder search. 
fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, @@ -620,7 +634,12 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); drop(entered); - let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?; + let (query_terms, negative_words) = + located_query_terms_from_tokens(ctx, tokens, words_limit)?; + + let ignored_documents = resolve_negative_words(ctx, &negative_words)?; + universe -= ignored_documents; + if query_terms.is_empty() { // Do a placeholder search instead None @@ -630,6 +649,7 @@ pub fn execute_search( } else { None }; + let bucket_sort_output = if let Some(query_terms) = query_terms { let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(new_located_query_terms); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index ea997a41a..b23cb2426 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -6,6 +6,7 @@ use charabia::{SeparatorKind, TokenKind}; use super::compute_derivations::partially_initialized_term_from_word; use super::{LocatedQueryTerm, ZeroTypoTerm}; use crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; +use crate::search::new::Word; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; /// Convert the tokenised search query into a list of located query terms. 
@@ -14,12 +15,14 @@ pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, words_limit: Option, -) -> Result> { +) -> Result<(Vec, Vec)> { let nbr_typos = number_of_typos_allowed(ctx)?; let mut located_terms = Vec::new(); let mut phrase: Option = None; + let mut negative_next_token = false; + let mut negative_words = Vec::new(); let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -33,7 +36,7 @@ pub fn located_query_terms_from_tokens( } // early return if word limit is exceeded if located_terms.len() >= parts_limit { - return Ok(located_terms); + return Ok((located_terms, negative_words)); } match token.kind { @@ -46,6 +49,11 @@ pub fn located_query_terms_from_tokens( // 3. if the word is the last token of the query we push it as a prefix word. if let Some(phrase) = &mut phrase { phrase.push_word(ctx, &token, position) + } else if negative_next_token { + let word = token.lemma().to_string(); + let word = Word::Original(ctx.word_interner.insert(word)); + negative_words.push(word); + negative_next_token = false; } else if peekable.peek().is_some() { match token.kind { TokenKind::Word => { @@ -63,7 +71,7 @@ pub fn located_query_terms_from_tokens( }; located_terms.push(located_term); } - TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {} + TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (), } } else { let word = token.lemma(); @@ -122,6 +130,10 @@ pub fn located_query_terms_from_tokens( // Start new phrase if the token ends with an opening quote (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) }; + + if phrase.is_none() && token.lemma() == "-" { + negative_next_token = true; + } } _ => (), } @@ -134,7 +146,7 @@ pub fn located_query_terms_from_tokens( } } - Ok(located_terms) + Ok((located_terms, negative_words)) } pub fn number_of_typos_allowed<'ctx>( @@ -317,6 +329,7 @@ mod tests { // panics with `attempt to add with overflow` before let located_query_terms = 
located_query_terms_from_tokens(&mut ctx, tokens, None)?; assert!(located_query_terms.is_empty()); + Ok(()) } } From 1da9e0f246c9dc0b7f12cc745dfb9711a9a039bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Mar 2024 17:42:09 +0100 Subject: [PATCH 2/6] Better support space around the negative operator (-) --- milli/src/search/new/query_term/parse_query.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index b23cb2426..e510595ee 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -21,6 +21,7 @@ pub fn located_query_terms_from_tokens( let mut located_terms = Vec::new(); let mut phrase: Option = None; + let mut encountered_whitespace = true; let mut negative_next_token = false; let mut negative_words = Vec::new(); @@ -34,6 +35,7 @@ pub fn located_query_terms_from_tokens( if token.lemma().is_empty() { continue; } + // early return if word limit is exceeded if located_terms.len() >= parts_limit { return Ok((located_terms, negative_words)); @@ -131,12 +133,14 @@ pub fn located_query_terms_from_tokens( (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) }; - if phrase.is_none() && token.lemma() == "-" { - negative_next_token = true; - } + negative_next_token = + phrase.is_none() && token.lemma() == "-" && encountered_whitespace; } _ => (), } + + encountered_whitespace = + token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some(); } // If a quote is never closed, we consider all of the end of the query as a phrase. 
From 34262c7a0d9baba51e2ebbc6a1b865c5ab5d2d00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Mar 2024 18:01:27 +0100 Subject: [PATCH 3/6] Add analytics for the negative operator --- meilisearch/src/analytics/segment_analytics.rs | 10 ++++++++++ meilisearch/src/search.rs | 6 +++++- milli/src/search/hybrid.rs | 3 +++ milli/src/search/mod.rs | 11 ++++++++++- milli/src/search/new/mod.rs | 5 +++++ 5 files changed, 33 insertions(+), 2 deletions(-) diff --git a/meilisearch/src/analytics/segment_analytics.rs b/meilisearch/src/analytics/segment_analytics.rs index 99298bd43..f20cff5d9 100644 --- a/meilisearch/src/analytics/segment_analytics.rs +++ b/meilisearch/src/analytics/segment_analytics.rs @@ -580,6 +580,7 @@ pub struct SearchAggregator { total_received: usize, total_succeeded: usize, total_degraded: usize, + total_used_negative_operator: usize, time_spent: BinaryHeap, // sort @@ -760,12 +761,16 @@ impl SearchAggregator { facet_distribution: _, facet_stats: _, degraded, + used_negative_operator, } = result; self.total_succeeded = self.total_succeeded.saturating_add(1); if *degraded { self.total_degraded = self.total_degraded.saturating_add(1); } + if *used_negative_operator { + self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1); + } self.time_spent.push(*processing_time_ms as usize); } @@ -808,6 +813,7 @@ impl SearchAggregator { embedder, hybrid, total_degraded, + total_used_negative_operator, } = other; if self.timestamp.is_none() { @@ -823,6 +829,8 @@ impl SearchAggregator { self.total_received = self.total_received.saturating_add(total_received); self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded); self.total_degraded = self.total_degraded.saturating_add(total_degraded); + self.total_used_negative_operator = + self.total_used_negative_operator.saturating_add(total_used_negative_operator); self.time_spent.append(time_spent); // sort @@ -929,6 +937,7 @@ impl SearchAggregator 
{ embedder, hybrid, total_degraded, + total_used_negative_operator, } = self; if total_received == 0 { @@ -949,6 +958,7 @@ impl SearchAggregator { "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics "total_received": total_received, "total_degraded": total_degraded, + "total_used_negative_operator": total_used_negative_operator, }, "sort": { "with_geoPoint": sort_with_geo_point, diff --git a/meilisearch/src/search.rs b/meilisearch/src/search.rs index 3c00ca802..db58c6102 100644 --- a/meilisearch/src/search.rs +++ b/meilisearch/src/search.rs @@ -324,9 +324,11 @@ pub struct SearchResult { #[serde(skip_serializing_if = "Option::is_none")] pub facet_stats: Option>, - // This information is only used for analytics purposes + // These fields are only used for analytics purposes #[serde(skip)] pub degraded: bool, + #[serde(skip)] + pub used_negative_operator: bool, } #[derive(Serialize, Debug, Clone, PartialEq)] @@ -512,6 +514,7 @@ pub fn perform_search( candidates, document_scores, degraded, + used_negative_operator, .. 
} = match &query.hybrid { Some(hybrid) => match *hybrid.semantic_ratio { @@ -717,6 +720,7 @@ pub fn perform_search( facet_distribution, facet_stats, degraded, + used_negative_operator, }; Ok(result) } diff --git a/milli/src/search/hybrid.rs b/milli/src/search/hybrid.rs index a8b7f0fcf..2a6d9f7a5 100644 --- a/milli/src/search/hybrid.rs +++ b/milli/src/search/hybrid.rs @@ -11,6 +11,7 @@ struct ScoreWithRatioResult { candidates: RoaringBitmap, document_scores: Vec<(u32, ScoreWithRatio)>, degraded: bool, + used_negative_operator: bool, } type ScoreWithRatio = (Vec, f32); @@ -78,6 +79,7 @@ impl ScoreWithRatioResult { candidates: results.candidates, document_scores, degraded: results.degraded, + used_negative_operator: results.used_negative_operator, } } @@ -113,6 +115,7 @@ impl ScoreWithRatioResult { documents_ids, document_scores, degraded: left.degraded | right.degraded, + used_negative_operator: left.used_negative_operator | right.used_negative_operator, } } } diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index b3dd0c091..3c709a647 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -183,6 +183,7 @@ impl<'a> Search<'a> { documents_ids, document_scores, degraded, + used_negative_operator, } = match self.vector.as_ref() { Some(vector) => execute_vector_search( &mut ctx, @@ -221,7 +222,14 @@ impl<'a> Search<'a> { None => MatchingWords::default(), }; - Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded }) + Ok(SearchResult { + matching_words, + candidates, + document_scores, + documents_ids, + degraded, + used_negative_operator, + }) } } @@ -272,6 +280,7 @@ pub struct SearchResult { pub documents_ids: Vec, pub document_scores: Vec>, pub degraded: bool, + pub used_negative_operator: bool, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index ec83b84d1..99f4562d6 100644 --- a/milli/src/search/new/mod.rs +++ 
b/milli/src/search/new/mod.rs @@ -571,6 +571,7 @@ pub fn execute_vector_search( documents_ids: docids, located_query_terms: None, degraded, + used_negative_operator: false, }) } @@ -594,6 +595,7 @@ pub fn execute_search( ) -> Result { check_sort_criteria(ctx, sort_criteria.as_ref())?; + let mut used_negative_operator = false; let mut located_query_terms = None; let query_terms = if let Some(query) = query { let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder"); @@ -636,6 +638,7 @@ pub fn execute_search( let (query_terms, negative_words) = located_query_terms_from_tokens(ctx, tokens, words_limit)?; + used_negative_operator = !negative_words.is_empty(); let ignored_documents = resolve_negative_words(ctx, &negative_words)?; universe -= ignored_documents; @@ -710,6 +713,7 @@ pub fn execute_search( documents_ids: docids, located_query_terms, degraded, + used_negative_operator, }) } @@ -772,4 +776,5 @@ pub struct PartialSearchResult { pub document_scores: Vec>, pub degraded: bool, + pub used_negative_operator: bool, } From 69f8b2730d9128816a18a5c100379f96e815ddc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 26 Mar 2024 18:07:43 +0100 Subject: [PATCH 4/6] Fix the tests --- milli/src/index.rs | 1 + milli/src/search/new/matches/matching_words.rs | 2 +- milli/src/search/new/query_term/parse_query.rs | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/milli/src/index.rs b/milli/src/index.rs index d921de9e4..b2ea105da 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -2435,6 +2435,7 @@ pub(crate) mod tests { document_scores: _, mut documents_ids, degraded: _, + used_negative_operator: _, } = search.execute().unwrap(); let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap(); documents_ids.sort_unstable(); diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index 9e01f6dcf..acc0de6b0 100644 --- 
a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -261,7 +261,7 @@ pub(crate) mod tests { let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); - let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); + let (query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index e510595ee..bdb937384 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -331,7 +331,7 @@ mod tests { let rtxn = index.read_txn()?; let mut ctx = SearchContext::new(&index, &rtxn); // panics with `attempt to add with overflow` before - let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?; + let (located_query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None)?; assert!(located_query_terms.is_empty()); Ok(()) From 877f4b1045ae0d0eb9c72a5c36c0d939a736d906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 28 Mar 2024 15:51:43 +0100 Subject: [PATCH 5/6] Support negative phrases --- .../src/search/new/matches/matching_words.rs | 4 +- milli/src/search/new/mod.rs | 26 +++++++- milli/src/search/new/query_term/mod.rs | 9 ++- .../src/search/new/query_term/parse_query.rs | 64 +++++++++++++++---- 4 files changed, 85 insertions(+), 18 deletions(-) diff --git a/milli/src/search/new/matches/matching_words.rs b/milli/src/search/new/matches/matching_words.rs index acc0de6b0..56bf6c169 100644 --- a/milli/src/search/new/matches/matching_words.rs +++ b/milli/src/search/new/matches/matching_words.rs @@ -240,6 +240,7 @@ pub(crate) mod tests { use super::super::super::located_query_terms_from_tokens; use super::*; use 
crate::index::tests::TempIndex; + use crate::search::new::query_term::ExtractedTokens; pub(crate) fn temp_index_with_documents() -> TempIndex { let temp_index = TempIndex::new(); @@ -261,7 +262,8 @@ pub(crate) mod tests { let mut builder = TokenizerBuilder::default(); let tokenizer = builder.build(); let tokens = tokenizer.tokenize("split this world"); - let (query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); + let ExtractedTokens { query_terms, .. } = + located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap(); let matching_words = MatchingWords::new(ctx, query_terms); assert_eq!( diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 99f4562d6..1f0ae7b29 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner}; pub use logger::visual::VisualSearchLogger; pub use logger::{DefaultSearchLogger, SearchLogger}; use query_graph::{QueryGraph, QueryNode}; -use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm}; +use query_term::{ + located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm, +}; use ranking_rules::{ BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait, }; @@ -223,6 +225,21 @@ fn resolve_negative_words( Ok(negative_bitmap) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] +fn resolve_negative_phrases( + ctx: &mut SearchContext, + negative_phrases: &[LocatedQueryTerm], +) -> Result { + let mut negative_bitmap = RoaringBitmap::new(); + for term in negative_phrases { + let query_term = ctx.term_interner.get(term.value); + if let Some(phrase) = query_term.original_phrase() { + negative_bitmap |= ctx.get_phrase_docids(phrase)?; + } + } + Ok(negative_bitmap) +} + /// Return the list of initialised ranking rules to be used for a placeholder search. 
fn get_ranking_rules_for_placeholder_search<'ctx>( ctx: &SearchContext<'ctx>, @@ -636,12 +653,15 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); drop(entered); - let (query_terms, negative_words) = + let ExtractedTokens { query_terms, negative_words, negative_phrases } = located_query_terms_from_tokens(ctx, tokens, words_limit)?; - used_negative_operator = !negative_words.is_empty(); + used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); let ignored_documents = resolve_negative_words(ctx, &negative_words)?; + let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; + universe -= ignored_documents; + universe -= ignored_phrases; if query_terms.is_empty() { // Do a placeholder search instead diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index a37e60ed0..70f1d3c4f 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -9,7 +9,9 @@ use std::ops::RangeInclusive; use either::Either; pub use ntypo_subset::NTypoTermSubset; -pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed}; +pub use parse_query::{ + located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens, +}; pub use phrase::Phrase; use super::interner::{DedupInterner, Interned}; @@ -478,6 +480,11 @@ impl QueryTerm { pub fn original_word(&self, ctx: &SearchContext) -> String { ctx.word_interner.get(self.original).clone() } + + pub fn original_phrase(&self) -> Option> { + self.zero_typo.phrase + } + pub fn all_computed_derivations(&self) -> (Vec>, Vec>) { let mut words = BTreeSet::new(); let mut phrases = BTreeSet::new(); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index bdb937384..86be7da77 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -9,21 +9,34 @@ use 
crate::search::new::query_term::{Lazy, Phrase, QueryTerm}; use crate::search::new::Word; use crate::{Result, SearchContext, MAX_WORD_LENGTH}; +#[derive(Clone)] +/// Extraction of the content of a query. +pub struct ExtractedTokens { + /// The terms to search for in the database. + pub query_terms: Vec, + /// The words that must not appear in the results. + pub negative_words: Vec, + /// The phrases that must not appear in the results. + pub negative_phrases: Vec, +} + /// Convert the tokenised search query into a list of located query terms. #[tracing::instrument(level = "trace", skip_all, target = "search::query")] pub fn located_query_terms_from_tokens( ctx: &mut SearchContext, query: NormalizedTokenIter, words_limit: Option, -) -> Result<(Vec, Vec)> { +) -> Result { let nbr_typos = number_of_typos_allowed(ctx)?; - let mut located_terms = Vec::new(); + let mut query_terms = Vec::new(); + let mut negative_phrase = false; let mut phrase: Option = None; let mut encountered_whitespace = true; let mut negative_next_token = false; let mut negative_words = Vec::new(); + let mut negative_phrases = Vec::new(); let parts_limit = words_limit.unwrap_or(usize::MAX); @@ -37,8 +50,8 @@ pub fn located_query_terms_from_tokens( } // early return if word limit is exceeded - if located_terms.len() >= parts_limit { - return Ok((located_terms, negative_words)); + if query_terms.len() >= parts_limit { + return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }); } match token.kind { @@ -71,7 +84,7 @@ pub fn located_query_terms_from_tokens( value: ctx.term_interner.push(term), positions: position..=position, }; - located_terms.push(located_term); + query_terms.push(located_term); } TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (), } @@ -88,7 +101,7 @@ pub fn located_query_terms_from_tokens( value: ctx.term_interner.push(term), positions: position..=position, }; - located_terms.push(located_term); + query_terms.push(located_term); } } 
TokenKind::Separator(separator_kind) => { @@ -104,7 +117,14 @@ pub fn located_query_terms_from_tokens( let phrase = if separator_kind == SeparatorKind::Hard { if let Some(phrase) = phrase { if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) + // as we are evaluating a negative operator we put the phrase + // in the negative one *but* we don't reset the negative operator + // as we are immediately starting a new negative phrase. + if negative_phrase { + negative_phrases.push(located_query_term); + } else { + query_terms.push(located_query_term); + } } Some(PhraseBuilder::empty()) } else { @@ -125,12 +145,24 @@ pub fn located_query_terms_from_tokens( // Per the check above, quote_count > 0 quote_count -= 1; if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term) + // we were evaluating a negative operator so we + // put the phrase in the negative phrases + if negative_phrase { + negative_phrases.push(located_query_term); + negative_phrase = false; + } else { + query_terms.push(located_query_term); + } } } // Start new phrase if the token ends with an opening quote - (quote_count % 2 == 1).then_some(PhraseBuilder::empty()) + if quote_count % 2 == 1 { + negative_phrase = negative_next_token; + Some(PhraseBuilder::empty()) + } else { + None + } }; negative_next_token = phrase.is_none() && token.lemma() == "-" && encountered_whitespace; } _ => (), } @@ -146,11 +178,16 @@ pub fn located_query_terms_from_tokens( // If a quote is never closed, we consider all of the end of the query as a phrase. if let Some(phrase) = phrase.take() { if let Some(located_query_term) = phrase.build(ctx) { - located_terms.push(located_query_term); + // put the phrase in the negative set if we are evaluating a negative operator.
+ if negative_phrase { + negative_phrases.push(located_query_term); + } else { + query_terms.push(located_query_term); + } } } - Ok((located_terms, negative_words)) + Ok(ExtractedTokens { query_terms, negative_words, negative_phrases }) } pub fn number_of_typos_allowed<'ctx>( @@ -331,8 +368,9 @@ mod tests { let rtxn = index.read_txn()?; let mut ctx = SearchContext::new(&index, &rtxn); // panics with `attempt to add with overflow` before - let (located_query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None)?; - assert!(located_query_terms.is_empty()); + let ExtractedTokens { query_terms, .. } = + located_query_terms_from_tokens(&mut ctx, tokens, None)?; + assert!(query_terms.is_empty()); Ok(()) } From 90e812fc0bae7ec1812df325736e0a83196df551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 4 Apr 2024 12:01:04 +0200 Subject: [PATCH 6/6] Add some tests --- meilisearch/tests/search/mod.rs | 104 ++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/meilisearch/tests/search/mod.rs b/meilisearch/tests/search/mod.rs index 88470187a..07de60d7c 100644 --- a/meilisearch/tests/search/mod.rs +++ b/meilisearch/tests/search/mod.rs @@ -184,6 +184,110 @@ async fn phrase_search_with_stop_word() { .await; } +#[actix_rt::test] +async fn negative_phrase_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search(json!({"q": "-\"train your dragon\"" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 4); + assert_eq!(hits[0]["id"], "287947"); + assert_eq!(hits[1]["id"], "299537"); + assert_eq!(hits[2]["id"], "522681"); + assert_eq!(hits[3]["id"], "450465"); + }) + .await; +} + +#[actix_rt::test] +async fn negative_word_search() { + let server = Server::new().await; + let index 
= server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search(json!({"q": "-escape" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 4); + assert_eq!(hits[0]["id"], "287947"); + assert_eq!(hits[1]["id"], "299537"); + assert_eq!(hits[2]["id"], "166428"); + assert_eq!(hits[3]["id"], "450465"); + }) + .await; + + // Everything that contains derivates of escape but not escape: nothing + index + .search(json!({"q": "-escape escape" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 0); + }) + .await; +} + +#[actix_rt::test] +async fn non_negative_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index + .search(json!({"q": "- escape" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0]["id"], "522681"); + }) + .await; + + index + .search(json!({"q": "- \"train your dragon\"" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0]["id"], "166428"); + }) + .await; +} + +#[actix_rt::test] +async fn negative_special_cases_search() { + let server = Server::new().await; + let index = server.index("test"); + + let documents = DOCUMENTS.clone(); + index.add_documents(documents, None).await; + index.wait_task(0).await; + + index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await; + index.wait_task(1).await; + + // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass + index + 
.search(json!({"q": "-escape escape" }), |response, code| { + assert_eq!(code, 200, "{}", response); + let hits = response["hits"].as_array().unwrap(); + assert_eq!(hits.len(), 1); + assert_eq!(hits[0]["id"], "450465"); + }) + .await; +} + #[cfg(feature = "default")] #[actix_rt::test] async fn test_kanji_language_detection() {