mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
Support negative phrases
This commit is contained in:
parent
69f8b2730d
commit
877f4b1045
@ -240,6 +240,7 @@ pub(crate) mod tests {
|
||||
use super::super::super::located_query_terms_from_tokens;
|
||||
use super::*;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::search::new::query_term::ExtractedTokens;
|
||||
|
||||
pub(crate) fn temp_index_with_documents() -> TempIndex {
|
||||
let temp_index = TempIndex::new();
|
||||
@ -261,7 +262,8 @@ pub(crate) mod tests {
|
||||
let mut builder = TokenizerBuilder::default();
|
||||
let tokenizer = builder.build();
|
||||
let tokens = tokenizer.tokenize("split this world");
|
||||
let (query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
let ExtractedTokens { query_terms, .. } =
|
||||
located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
|
||||
let matching_words = MatchingWords::new(ctx, query_terms);
|
||||
|
||||
assert_eq!(
|
||||
|
@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner};
|
||||
pub use logger::visual::VisualSearchLogger;
|
||||
pub use logger::{DefaultSearchLogger, SearchLogger};
|
||||
use query_graph::{QueryGraph, QueryNode};
|
||||
use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm};
|
||||
use query_term::{
|
||||
located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
|
||||
};
|
||||
use ranking_rules::{
|
||||
BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
|
||||
};
|
||||
@ -223,6 +225,21 @@ fn resolve_negative_words(
|
||||
Ok(negative_bitmap)
|
||||
}
|
||||
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search")]
|
||||
fn resolve_negative_phrases(
|
||||
ctx: &mut SearchContext,
|
||||
negative_phrases: &[LocatedQueryTerm],
|
||||
) -> Result<RoaringBitmap> {
|
||||
let mut negative_bitmap = RoaringBitmap::new();
|
||||
for term in negative_phrases {
|
||||
let query_term = ctx.term_interner.get(term.value);
|
||||
if let Some(phrase) = query_term.original_phrase() {
|
||||
negative_bitmap |= ctx.get_phrase_docids(phrase)?;
|
||||
}
|
||||
}
|
||||
Ok(negative_bitmap)
|
||||
}
|
||||
|
||||
/// Return the list of initialised ranking rules to be used for a placeholder search.
|
||||
fn get_ranking_rules_for_placeholder_search<'ctx>(
|
||||
ctx: &SearchContext<'ctx>,
|
||||
@ -636,12 +653,15 @@ pub fn execute_search(
|
||||
let tokens = tokenizer.tokenize(query);
|
||||
drop(entered);
|
||||
|
||||
let (query_terms, negative_words) =
|
||||
let ExtractedTokens { query_terms, negative_words, negative_phrases } =
|
||||
located_query_terms_from_tokens(ctx, tokens, words_limit)?;
|
||||
used_negative_operator = !negative_words.is_empty();
|
||||
used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
|
||||
|
||||
let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
|
||||
let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
|
||||
|
||||
universe -= ignored_documents;
|
||||
universe -= ignored_phrases;
|
||||
|
||||
if query_terms.is_empty() {
|
||||
// Do a placeholder search instead
|
||||
|
@ -9,7 +9,9 @@ use std::ops::RangeInclusive;
|
||||
|
||||
use either::Either;
|
||||
pub use ntypo_subset::NTypoTermSubset;
|
||||
pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
|
||||
pub use parse_query::{
|
||||
located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
|
||||
};
|
||||
pub use phrase::Phrase;
|
||||
|
||||
use super::interner::{DedupInterner, Interned};
|
||||
@ -478,6 +480,11 @@ impl QueryTerm {
|
||||
pub fn original_word(&self, ctx: &SearchContext) -> String {
|
||||
ctx.word_interner.get(self.original).clone()
|
||||
}
|
||||
|
||||
pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
|
||||
self.zero_typo.phrase
|
||||
}
|
||||
|
||||
pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
|
||||
let mut words = BTreeSet::new();
|
||||
let mut phrases = BTreeSet::new();
|
||||
|
@ -9,21 +9,34 @@ use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
|
||||
use crate::search::new::Word;
|
||||
use crate::{Result, SearchContext, MAX_WORD_LENGTH};
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Extraction of the content of a query.
|
||||
pub struct ExtractedTokens {
|
||||
/// The terms to search for in the database.
|
||||
pub query_terms: Vec<LocatedQueryTerm>,
|
||||
/// The words that must not appear in the results.
|
||||
pub negative_words: Vec<Word>,
|
||||
/// The phrases that must not appear in the results.
|
||||
pub negative_phrases: Vec<LocatedQueryTerm>,
|
||||
}
|
||||
|
||||
/// Convert the tokenised search query into a list of located query terms.
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "search::query")]
|
||||
pub fn located_query_terms_from_tokens(
|
||||
ctx: &mut SearchContext,
|
||||
query: NormalizedTokenIter,
|
||||
words_limit: Option<usize>,
|
||||
) -> Result<(Vec<LocatedQueryTerm>, Vec<Word>)> {
|
||||
) -> Result<ExtractedTokens> {
|
||||
let nbr_typos = number_of_typos_allowed(ctx)?;
|
||||
|
||||
let mut located_terms = Vec::new();
|
||||
let mut query_terms = Vec::new();
|
||||
|
||||
let mut negative_phrase = false;
|
||||
let mut phrase: Option<PhraseBuilder> = None;
|
||||
let mut encountered_whitespace = true;
|
||||
let mut negative_next_token = false;
|
||||
let mut negative_words = Vec::new();
|
||||
let mut negative_phrases = Vec::new();
|
||||
|
||||
let parts_limit = words_limit.unwrap_or(usize::MAX);
|
||||
|
||||
@ -37,8 +50,8 @@ pub fn located_query_terms_from_tokens(
|
||||
}
|
||||
|
||||
// early return if word limit is exceeded
|
||||
if located_terms.len() >= parts_limit {
|
||||
return Ok((located_terms, negative_words));
|
||||
if query_terms.len() >= parts_limit {
|
||||
return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
|
||||
}
|
||||
|
||||
match token.kind {
|
||||
@ -71,7 +84,7 @@ pub fn located_query_terms_from_tokens(
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
located_terms.push(located_term);
|
||||
query_terms.push(located_term);
|
||||
}
|
||||
TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
|
||||
}
|
||||
@ -88,7 +101,7 @@ pub fn located_query_terms_from_tokens(
|
||||
value: ctx.term_interner.push(term),
|
||||
positions: position..=position,
|
||||
};
|
||||
located_terms.push(located_term);
|
||||
query_terms.push(located_term);
|
||||
}
|
||||
}
|
||||
TokenKind::Separator(separator_kind) => {
|
||||
@ -104,7 +117,14 @@ pub fn located_query_terms_from_tokens(
|
||||
let phrase = if separator_kind == SeparatorKind::Hard {
|
||||
if let Some(phrase) = phrase {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term)
|
||||
// as we are evaluating a negative operator we put the phrase
|
||||
// in the negative one *but* we don't reset the negative operator
|
||||
// as we are immediatly starting a new negative phrase.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
Some(PhraseBuilder::empty())
|
||||
} else {
|
||||
@ -125,12 +145,24 @@ pub fn located_query_terms_from_tokens(
|
||||
// Per the check above, quote_count > 0
|
||||
quote_count -= 1;
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term)
|
||||
// we were evaluating a negative operator so we
|
||||
// put the phrase in the negative phrases
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
negative_phrase = false;
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Start new phrase if the token ends with an opening quote
|
||||
(quote_count % 2 == 1).then_some(PhraseBuilder::empty())
|
||||
if quote_count % 2 == 1 {
|
||||
negative_phrase = negative_next_token;
|
||||
Some(PhraseBuilder::empty())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
};
|
||||
|
||||
negative_next_token =
|
||||
@ -146,11 +178,16 @@ pub fn located_query_terms_from_tokens(
|
||||
// If a quote is never closed, we consider all of the end of the query as a phrase.
|
||||
if let Some(phrase) = phrase.take() {
|
||||
if let Some(located_query_term) = phrase.build(ctx) {
|
||||
located_terms.push(located_query_term);
|
||||
// put the phrase in the negative set if we are evaluating a negative operator.
|
||||
if negative_phrase {
|
||||
negative_phrases.push(located_query_term);
|
||||
} else {
|
||||
query_terms.push(located_query_term);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok((located_terms, negative_words))
|
||||
Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
|
||||
}
|
||||
|
||||
pub fn number_of_typos_allowed<'ctx>(
|
||||
@ -331,8 +368,9 @@ mod tests {
|
||||
let rtxn = index.read_txn()?;
|
||||
let mut ctx = SearchContext::new(&index, &rtxn);
|
||||
// panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
|
||||
let (located_query_terms, _) = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
|
||||
assert!(located_query_terms.is_empty());
|
||||
let ExtractedTokens { query_terms, .. } =
|
||||
located_query_terms_from_tokens(&mut ctx, tokens, None)?;
|
||||
assert!(query_terms.is_empty());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user