4535: Support Negative Keywords r=ManyTheFish a=Kerollmops

This PR fixes #4422 by supporting a `-` prefix before any word or phrase in the query.

The ASCII minus symbol `-` is not the only character treated as the negative operator: the two other matching characters are listed under the `Based on "-" (U+002D)` section of [this Unicode reference website](https://www.compart.com/en/unicode/U+002D).

Note the somewhat surprising behavior when a query both includes and excludes the same word: only the derivatives (synonyms and word splits) are kept:
 - If you input `progamer -progamer`, the engine still searches for the split form `pro gamer` (see the sketch after this list).
 - If you have the synonym `like = love` and you input `like -like`, it still searches for `love`.
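A self-contained sketch of this behavior, using a toy corpus and plain `HashSet`s rather than the engine's code (the documents, derivation list, and names here are all hypothetical): the negative operator removes only documents containing the *original* word, while the positive term still searches through its derivations.

```rust
use std::collections::HashSet;

fn main() {
    // Hypothetical corpus: docid -> the words it contains.
    let docs: &[(u32, &[&str])] = &[
        (0, &["progamer"]),
        (1, &["pro", "gamer"]),
        (2, &["gamer"]),
    ];

    // Derivations of the positive term `progamer` (the word split is kept).
    let positive_derivations = ["progamer", "pro gamer"];
    // The negative operator only resolves the original word.
    let negative_word = "progamer";

    // Candidates: documents matching any derivation of the positive term.
    let candidates: HashSet<u32> = docs
        .iter()
        .filter(|(_, words)| {
            let joined = words.join(" ");
            positive_derivations.iter().any(|d| joined.contains(d))
        })
        .map(|(id, _)| *id)
        .collect();

    // Exclude documents containing the negated original word.
    let results: HashSet<u32> = candidates
        .into_iter()
        .filter(|&id| !docs[id as usize].1.contains(&negative_word))
        .collect();

    // Document 1 ("pro gamer") survives even though `progamer` is negated.
    assert_eq!(results, HashSet::from([1]));
}
```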

## TODO
 - [x] Add analytics
 - [x] Add support for the `-` operator
 - [x] Make sure to correctly handle spaces around `-` (see the sketch after this list)
 - [x] Support phrase negation
 - [x] Add tests
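For the spaces item, the whole rule is one predicate: it mirrors the `negative_next_token` assignment in the `parse_query.rs` diff below, but `arms_negation` and its flags are hypothetical stand-ins for the real parser state.

```rust
/// `-` arms the negative operator only when it is not inside a quoted
/// phrase and directly follows whitespace.
fn arms_negation(lemma: &str, in_phrase: bool, after_whitespace: bool) -> bool {
    !in_phrase && lemma == "-" && after_whitespace
}

fn main() {
    assert!(arms_negation("-", false, true));   // `hello -world`: negates `world`
    assert!(!arms_negation("-", false, false)); // `hello-world`: compound word
    assert!(!arms_negation("-", true, true));   // inside `"..."`: stays literal
}
```

In the real parser the flag is recomputed on every separator token, so a `-` that is not directly attached to the following word negates nothing.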


Co-authored-by: Clément Renault <clement@meilisearch.com>
Merged by meili-bors[bot] on 2024-04-04 13:10:27 +00:00 in commit 5509bafff8.
10 changed files with 260 additions and 20 deletions


@@ -2435,6 +2435,7 @@ pub(crate) mod tests {
document_scores: _,
mut documents_ids,
degraded: _,
used_negative_operator: _,
} = search.execute().unwrap();
let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
documents_ids.sort_unstable();


@@ -11,6 +11,7 @@ struct ScoreWithRatioResult {
candidates: RoaringBitmap,
document_scores: Vec<(u32, ScoreWithRatio)>,
degraded: bool,
used_negative_operator: bool,
}
type ScoreWithRatio = (Vec<ScoreDetails>, f32);
@@ -78,6 +79,7 @@ impl ScoreWithRatioResult {
candidates: results.candidates,
document_scores,
degraded: results.degraded,
used_negative_operator: results.used_negative_operator,
}
}
@@ -113,6 +115,7 @@ impl ScoreWithRatioResult {
documents_ids,
document_scores,
degraded: left.degraded | right.degraded,
used_negative_operator: left.used_negative_operator | right.used_negative_operator,
}
}
}


@@ -183,6 +183,7 @@ impl<'a> Search<'a> {
documents_ids,
document_scores,
degraded,
used_negative_operator,
} = match self.vector.as_ref() {
Some(vector) => execute_vector_search(
&mut ctx,
@@ -221,7 +222,14 @@
None => MatchingWords::default(),
};
Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded })
Ok(SearchResult {
matching_words,
candidates,
document_scores,
documents_ids,
degraded,
used_negative_operator,
})
}
}
@@ -272,6 +280,7 @@ pub struct SearchResult {
pub documents_ids: Vec<DocumentId>,
pub document_scores: Vec<Vec<ScoreDetails>>,
pub degraded: bool,
pub used_negative_operator: bool,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]


@@ -240,6 +240,7 @@ pub(crate) mod tests {
use super::super::super::located_query_terms_from_tokens;
use super::*;
use crate::index::tests::TempIndex;
use crate::search::new::query_term::ExtractedTokens;
pub(crate) fn temp_index_with_documents() -> TempIndex {
let temp_index = TempIndex::new();
@@ -261,7 +262,8 @@ pub(crate) mod tests {
let mut builder = TokenizerBuilder::default();
let tokenizer = builder.build();
let tokens = tokenizer.tokenize("split this world");
let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
let matching_words = MatchingWords::new(ctx, query_terms);
assert_eq!(


@@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner};
pub use logger::visual::VisualSearchLogger;
pub use logger::{DefaultSearchLogger, SearchLogger};
use query_graph::{QueryGraph, QueryNode};
use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm};
use query_term::{
located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
};
use ranking_rules::{
BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
};
@@ -209,6 +211,35 @@ fn resolve_universe(
)
}
#[tracing::instrument(level = "trace", skip_all, target = "search")]
fn resolve_negative_words(
ctx: &mut SearchContext,
negative_words: &[Word],
) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new();
for &word in negative_words {
if let Some(bitmap) = ctx.word_docids(word)? {
negative_bitmap |= bitmap;
}
}
Ok(negative_bitmap)
}
#[tracing::instrument(level = "trace", skip_all, target = "search")]
fn resolve_negative_phrases(
ctx: &mut SearchContext,
negative_phrases: &[LocatedQueryTerm],
) -> Result<RoaringBitmap> {
let mut negative_bitmap = RoaringBitmap::new();
for term in negative_phrases {
let query_term = ctx.term_interner.get(term.value);
if let Some(phrase) = query_term.original_phrase() {
negative_bitmap |= ctx.get_phrase_docids(phrase)?;
}
}
Ok(negative_bitmap)
}
/// Return the list of initialised ranking rules to be used for a placeholder search.
fn get_ranking_rules_for_placeholder_search<'ctx>(
ctx: &SearchContext<'ctx>,
@@ -557,6 +588,7 @@ pub fn execute_vector_search(
documents_ids: docids,
located_query_terms: None,
degraded,
used_negative_operator: false,
})
}
@@ -580,6 +612,7 @@ pub fn execute_search(
) -> Result<PartialSearchResult> {
check_sort_criteria(ctx, sort_criteria.as_ref())?;
let mut used_negative_operator = false;
let mut located_query_terms = None;
let query_terms = if let Some(query) = query {
let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
@@ -620,7 +653,16 @@ pub fn execute_search(
let tokens = tokenizer.tokenize(query);
drop(entered);
let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
let ExtractedTokens { query_terms, negative_words, negative_phrases } =
located_query_terms_from_tokens(ctx, tokens, words_limit)?;
used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
universe -= ignored_documents;
universe -= ignored_phrases;
if query_terms.is_empty() {
// Do a placeholder search instead
None
@@ -630,6 +672,7 @@ pub fn execute_search(
} else {
None
};
let bucket_sort_output = if let Some(query_terms) = query_terms {
let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(new_located_query_terms);
@@ -690,6 +733,7 @@ pub fn execute_search(
documents_ids: docids,
located_query_terms,
degraded,
used_negative_operator,
})
}
@@ -752,4 +796,5 @@ pub struct PartialSearchResult {
pub document_scores: Vec<Vec<ScoreDetails>>,
pub degraded: bool,
pub used_negative_operator: bool,
}
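The exclusion itself is plain bitmap subtraction: the docids of negated words and phrases are resolved once up front and removed from the search universe before any ranking rule runs (`universe -= ignored_documents` above). A minimal sketch with the `roaring` crate, using hypothetical docids:

```rust
use roaring::RoaringBitmap;

fn main() {
    // Hypothetical docids: the universe before any ranking rule runs.
    let mut universe: RoaringBitmap = (0..10).collect();

    // Docids resolved from the negated words/phrases, as in
    // `resolve_negative_words` / `resolve_negative_phrases` above.
    let ignored_documents: RoaringBitmap = [2u32, 5, 7].into_iter().collect();

    // Same operation as `universe -= ignored_documents` in the diff.
    universe -= ignored_documents;

    assert_eq!(universe.len(), 7);
    assert!(!universe.contains(5));
}
```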


@@ -9,7 +9,9 @@ use std::ops::RangeInclusive;
use either::Either;
pub use ntypo_subset::NTypoTermSubset;
pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
pub use parse_query::{
located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
};
pub use phrase::Phrase;
use super::interner::{DedupInterner, Interned};
@@ -478,6 +480,11 @@ impl QueryTerm {
pub fn original_word(&self, ctx: &SearchContext) -> String {
ctx.word_interner.get(self.original).clone()
}
pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
self.zero_typo.phrase
}
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
let mut words = BTreeSet::new();
let mut phrases = BTreeSet::new();


@@ -6,20 +6,37 @@ use charabia::{SeparatorKind, TokenKind};
use super::compute_derivations::partially_initialized_term_from_word;
use super::{LocatedQueryTerm, ZeroTypoTerm};
use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
use crate::search::new::Word;
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
#[derive(Clone)]
/// Extraction of the content of a query.
pub struct ExtractedTokens {
/// The terms to search for in the database.
pub query_terms: Vec<LocatedQueryTerm>,
/// The words that must not appear in the results.
pub negative_words: Vec<Word>,
/// The phrases that must not appear in the results.
pub negative_phrases: Vec<LocatedQueryTerm>,
}
/// Convert the tokenised search query into a list of located query terms.
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
pub fn located_query_terms_from_tokens(
ctx: &mut SearchContext,
query: NormalizedTokenIter,
words_limit: Option<usize>,
) -> Result<Vec<LocatedQueryTerm>> {
) -> Result<ExtractedTokens> {
let nbr_typos = number_of_typos_allowed(ctx)?;
let mut located_terms = Vec::new();
let mut query_terms = Vec::new();
let mut negative_phrase = false;
let mut phrase: Option<PhraseBuilder> = None;
let mut encountered_whitespace = true;
let mut negative_next_token = false;
let mut negative_words = Vec::new();
let mut negative_phrases = Vec::new();
let parts_limit = words_limit.unwrap_or(usize::MAX);
@@ -31,9 +48,10 @@ pub fn located_query_terms_from_tokens(
if token.lemma().is_empty() {
continue;
}
// early return if word limit is exceeded
if located_terms.len() >= parts_limit {
return Ok(located_terms);
if query_terms.len() >= parts_limit {
return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
}
match token.kind {
@@ -46,6 +64,11 @@ pub fn located_query_terms_from_tokens(
// 3. if the word is the last token of the query we push it as a prefix word.
if let Some(phrase) = &mut phrase {
phrase.push_word(ctx, &token, position)
} else if negative_next_token {
let word = token.lemma().to_string();
let word = Word::Original(ctx.word_interner.insert(word));
negative_words.push(word);
negative_next_token = false;
} else if peekable.peek().is_some() {
match token.kind {
TokenKind::Word => {
@@ -61,9 +84,9 @@ pub fn located_query_terms_from_tokens(
value: ctx.term_interner.push(term),
positions: position..=position,
};
located_terms.push(located_term);
query_terms.push(located_term);
}
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
}
} else {
let word = token.lemma();
@@ -78,7 +101,7 @@ pub fn located_query_terms_from_tokens(
value: ctx.term_interner.push(term),
positions: position..=position,
};
located_terms.push(located_term);
query_terms.push(located_term);
}
}
TokenKind::Separator(separator_kind) => {
@@ -94,7 +117,14 @@ pub fn located_query_terms_from_tokens(
let phrase = if separator_kind == SeparatorKind::Hard {
if let Some(phrase) = phrase {
if let Some(located_query_term) = phrase.build(ctx) {
located_terms.push(located_query_term)
// as we are evaluating a negative operator we put the phrase
// in the negative phrases *but* we don't reset the negative
// operator, as we are immediately starting a new negative phrase.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {
query_terms.push(located_query_term);
}
}
Some(PhraseBuilder::empty())
} else {
@@ -115,26 +145,49 @@ pub fn located_query_terms_from_tokens(
// Per the check above, quote_count > 0
quote_count -= 1;
if let Some(located_query_term) = phrase.build(ctx) {
located_terms.push(located_query_term)
// we were evaluating a negative operator so we
// put the phrase in the negative phrases
if negative_phrase {
negative_phrases.push(located_query_term);
negative_phrase = false;
} else {
query_terms.push(located_query_term);
}
}
}
// Start new phrase if the token ends with an opening quote
(quote_count % 2 == 1).then_some(PhraseBuilder::empty())
if quote_count % 2 == 1 {
negative_phrase = negative_next_token;
Some(PhraseBuilder::empty())
} else {
None
}
};
negative_next_token =
phrase.is_none() && token.lemma() == "-" && encountered_whitespace;
}
_ => (),
}
encountered_whitespace =
token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some();
}
// If a quote is never closed, we consider all of the end of the query as a phrase.
if let Some(phrase) = phrase.take() {
if let Some(located_query_term) = phrase.build(ctx) {
located_terms.push(located_query_term);
// put the phrase in the negative set if we are evaluating a negative operator.
if negative_phrase {
negative_phrases.push(located_query_term);
} else {
query_terms.push(located_query_term);
}
}
}
Ok(located_terms)
Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
}
pub fn number_of_typos_allowed<'ctx>(
@@ -315,8 +368,10 @@ mod tests {
let rtxn = index.read_txn()?;
let mut ctx = SearchContext::new(&index, &rtxn);
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(located_query_terms.is_empty());
let ExtractedTokens { query_terms, .. } =
located_query_terms_from_tokens(&mut ctx, tokens, None)?;
assert!(query_terms.is_empty());
Ok(())
}
}
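For reference, `located_query_terms_from_tokens` consumes charabia's normalized token stream, where the `-` arrives as a separator token in front of the word it negates; a small sketch to inspect that stream (the exact tokens emitted are indicative only and depend on charabia's segmentation):

```rust
use charabia::{TokenKind, TokenizerBuilder};

fn main() {
    let mut builder = TokenizerBuilder::default();
    let tokenizer = builder.build();
    // `-escape` arrives as a separator carrying the `-` followed by the
    // word `escape`; the parser bridges the two with `negative_next_token`.
    for token in tokenizer.tokenize("room -escape") {
        let kind = match token.kind {
            TokenKind::Word => "word",
            TokenKind::Separator(_) => "separator",
            _ => "other",
        };
        println!("{kind}: {:?}", token.lemma());
    }
}
```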