Mirror of https://github.com/meilisearch/MeiliSearch (synced 2024-11-26 14:54:27 +01:00)
Merge #4535
4535: Support Negative Keywords r=ManyTheFish a=Kerollmops

This PR fixes #4422 by supporting `-` before any word in the query. The minus symbol `-`, from the ASCII table, is not the only character that can act as the negative operator; you can see the two other matching characters under the `Based on "-" (U+002D)` section on [this unicode reference website](https://www.compart.com/en/unicode/U+002D).

Note the somewhat surprising behavior when a query both includes and excludes the same word: only the derivatives (synonyms and splits) are kept.

- If you input `progamer -progamer`, the engine will still search for `pro gamer`.
- If you have the synonym `like = love` and you input `like -like`, it will still search for `love`.

## TODO

- [x] Add analytics
- [x] Add support to the `-` operator
- [x] Make sure to support spaces around `-` well
- [x] Support phrase negation
- [x] Add tests

Co-authored-by: Clément Renault <clement@meilisearch.com>
commit 5509bafff8
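The following is a minimal sketch of how the new operator is exercised through the search route. It is modelled on the integration tests added further down in this diff and assumes the same test harness (`Server`, `DOCUMENTS`, `json!`) and movie fixture those tests use.

```rust
// Hedged sketch: mirrors the patterns of the `negative_word_search` and
// `negative_phrase_search` tests added in this diff; `Server` and `DOCUMENTS`
// come from the existing test support code and are assumed here.
#[actix_rt::test]
async fn negative_keyword_examples() {
    let server = Server::new().await;
    let index = server.index("test");

    index.add_documents(DOCUMENTS.clone(), None).await;
    index.wait_task(0).await;

    // `-escape`: exclude every document containing the word "escape".
    index
        .search(json!({ "q": "-escape" }), |response, code| {
            assert_eq!(code, 200, "{}", response);
        })
        .await;

    // `-"train your dragon"`: exclude documents containing the whole phrase.
    index
        .search(json!({ "q": "-\"train your dragon\"" }), |response, code| {
            assert_eq!(code, 200, "{}", response);
        })
        .await;

    // A space after `-` disables the operator: `- escape` searches for "escape".
    index
        .search(json!({ "q": "- escape" }), |response, code| {
            assert_eq!(code, 200, "{}", response);
        })
        .await;
}
```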
@@ -583,6 +583,7 @@ pub struct SearchAggregator {
     total_received: usize,
     total_succeeded: usize,
     total_degraded: usize,
+    total_used_negative_operator: usize,
     time_spent: BinaryHeap<usize>,

     // sort
@@ -763,12 +764,16 @@ impl SearchAggregator {
             facet_distribution: _,
             facet_stats: _,
             degraded,
+            used_negative_operator,
         } = result;

         self.total_succeeded = self.total_succeeded.saturating_add(1);
         if *degraded {
             self.total_degraded = self.total_degraded.saturating_add(1);
         }
+        if *used_negative_operator {
+            self.total_used_negative_operator = self.total_used_negative_operator.saturating_add(1);
+        }
         self.time_spent.push(*processing_time_ms as usize);
     }

@@ -811,6 +816,7 @@ impl SearchAggregator {
             embedder,
             hybrid,
             total_degraded,
+            total_used_negative_operator,
         } = other;

         if self.timestamp.is_none() {
@@ -826,6 +832,8 @@ impl SearchAggregator {
         self.total_received = self.total_received.saturating_add(total_received);
         self.total_succeeded = self.total_succeeded.saturating_add(total_succeeded);
         self.total_degraded = self.total_degraded.saturating_add(total_degraded);
+        self.total_used_negative_operator =
+            self.total_used_negative_operator.saturating_add(total_used_negative_operator);
         self.time_spent.append(time_spent);

         // sort
@@ -932,6 +940,7 @@ impl SearchAggregator {
             embedder,
             hybrid,
             total_degraded,
+            total_used_negative_operator,
         } = self;

         if total_received == 0 {
@@ -952,6 +961,7 @@ impl SearchAggregator {
                 "total_failed": total_received.saturating_sub(total_succeeded), // just to be sure we never panics
                 "total_received": total_received,
                 "total_degraded": total_degraded,
+                "total_used_negative_operator": total_used_negative_operator,
             },
             "sort": {
                 "with_geoPoint": sort_with_geo_point,
@@ -324,9 +324,11 @@ pub struct SearchResult {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub facet_stats: Option<BTreeMap<String, FacetStats>>,

-    // This information is only used for analytics purposes
+    // These fields are only used for analytics purposes
     #[serde(skip)]
     pub degraded: bool,
+    #[serde(skip)]
+    pub used_negative_operator: bool,
 }

 #[derive(Serialize, Debug, Clone, PartialEq)]
@@ -512,6 +514,7 @@ pub fn perform_search(
         candidates,
         document_scores,
         degraded,
+        used_negative_operator,
         ..
     } = match &query.hybrid {
         Some(hybrid) => match *hybrid.semantic_ratio {
@@ -717,6 +720,7 @@ pub fn perform_search(
         facet_distribution,
         facet_stats,
         degraded,
+        used_negative_operator,
     };
     Ok(result)
 }
@@ -185,6 +185,110 @@ async fn phrase_search_with_stop_word() {
         .await;
 }

+#[actix_rt::test]
+async fn negative_phrase_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    index
+        .search(json!({"q": "-\"train your dragon\"" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 4);
+            assert_eq!(hits[0]["id"], "287947");
+            assert_eq!(hits[1]["id"], "299537");
+            assert_eq!(hits[2]["id"], "522681");
+            assert_eq!(hits[3]["id"], "450465");
+        })
+        .await;
+}
+
+#[actix_rt::test]
+async fn negative_word_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    index
+        .search(json!({"q": "-escape" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 4);
+            assert_eq!(hits[0]["id"], "287947");
+            assert_eq!(hits[1]["id"], "299537");
+            assert_eq!(hits[2]["id"], "166428");
+            assert_eq!(hits[3]["id"], "450465");
+        })
+        .await;
+
+    // Everything that contains derivates of escape but not escape: nothing
+    index
+        .search(json!({"q": "-escape escape" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 0);
+        })
+        .await;
+}
+
+#[actix_rt::test]
+async fn non_negative_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    index
+        .search(json!({"q": "- escape" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 1);
+            assert_eq!(hits[0]["id"], "522681");
+        })
+        .await;
+
+    index
+        .search(json!({"q": "- \"train your dragon\"" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 1);
+            assert_eq!(hits[0]["id"], "166428");
+        })
+        .await;
+}
+
+#[actix_rt::test]
+async fn negative_special_cases_search() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let documents = DOCUMENTS.clone();
+    index.add_documents(documents, None).await;
+    index.wait_task(0).await;
+
+    index.update_settings(json!({"synonyms": { "escape": ["glass"] }})).await;
+    index.wait_task(1).await;
+
+    // There is a synonym for escape -> glass but we don't want "escape", only the derivates: glass
+    index
+        .search(json!({"q": "-escape escape" }), |response, code| {
+            assert_eq!(code, 200, "{}", response);
+            let hits = response["hits"].as_array().unwrap();
+            assert_eq!(hits.len(), 1);
+            assert_eq!(hits[0]["id"], "450465");
+        })
+        .await;
+}
+
 #[cfg(feature = "default")]
 #[actix_rt::test]
 async fn test_kanji_language_detection() {
@@ -2435,6 +2435,7 @@ pub(crate) mod tests {
             document_scores: _,
             mut documents_ids,
             degraded: _,
+            used_negative_operator: _,
         } = search.execute().unwrap();
         let primary_key_id = index.fields_ids_map(&rtxn).unwrap().id("primary_key").unwrap();
         documents_ids.sort_unstable();
@@ -11,6 +11,7 @@ struct ScoreWithRatioResult {
     candidates: RoaringBitmap,
     document_scores: Vec<(u32, ScoreWithRatio)>,
     degraded: bool,
+    used_negative_operator: bool,
 }

 type ScoreWithRatio = (Vec<ScoreDetails>, f32);
@@ -78,6 +79,7 @@ impl ScoreWithRatioResult {
             candidates: results.candidates,
             document_scores,
             degraded: results.degraded,
+            used_negative_operator: results.used_negative_operator,
         }
     }

@@ -113,6 +115,7 @@ impl ScoreWithRatioResult {
             documents_ids,
             document_scores,
             degraded: left.degraded | right.degraded,
+            used_negative_operator: left.used_negative_operator | right.used_negative_operator,
         }
     }
 }
@@ -183,6 +183,7 @@ impl<'a> Search<'a> {
             documents_ids,
             document_scores,
             degraded,
+            used_negative_operator,
         } = match self.vector.as_ref() {
             Some(vector) => execute_vector_search(
                 &mut ctx,
@@ -221,7 +222,14 @@ impl<'a> Search<'a> {
             None => MatchingWords::default(),
         };

-        Ok(SearchResult { matching_words, candidates, document_scores, documents_ids, degraded })
+        Ok(SearchResult {
+            matching_words,
+            candidates,
+            document_scores,
+            documents_ids,
+            degraded,
+            used_negative_operator,
+        })
     }
 }

@@ -272,6 +280,7 @@ pub struct SearchResult {
     pub documents_ids: Vec<DocumentId>,
     pub document_scores: Vec<Vec<ScoreDetails>>,
     pub degraded: bool,
+    pub used_negative_operator: bool,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
@@ -240,6 +240,7 @@ pub(crate) mod tests {
     use super::super::super::located_query_terms_from_tokens;
     use super::*;
     use crate::index::tests::TempIndex;
+    use crate::search::new::query_term::ExtractedTokens;

     pub(crate) fn temp_index_with_documents() -> TempIndex {
         let temp_index = TempIndex::new();
@@ -261,7 +262,8 @@ pub(crate) mod tests {
         let mut builder = TokenizerBuilder::default();
         let tokenizer = builder.build();
         let tokens = tokenizer.tokenize("split this world");
-        let query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
+        let ExtractedTokens { query_terms, .. } =
+            located_query_terms_from_tokens(&mut ctx, tokens, None).unwrap();
         let matching_words = MatchingWords::new(ctx, query_terms);

         assert_eq!(
@@ -33,7 +33,9 @@ use interner::{DedupInterner, Interner};
 pub use logger::visual::VisualSearchLogger;
 pub use logger::{DefaultSearchLogger, SearchLogger};
 use query_graph::{QueryGraph, QueryNode};
-use query_term::{located_query_terms_from_tokens, LocatedQueryTerm, Phrase, QueryTerm};
+use query_term::{
+    located_query_terms_from_tokens, ExtractedTokens, LocatedQueryTerm, Phrase, QueryTerm,
+};
 use ranking_rules::{
     BoxRankingRule, PlaceholderQuery, RankingRule, RankingRuleOutput, RankingRuleQueryTrait,
 };
@@ -209,6 +211,35 @@ fn resolve_universe(
     )
 }

+#[tracing::instrument(level = "trace", skip_all, target = "search")]
+fn resolve_negative_words(
+    ctx: &mut SearchContext,
+    negative_words: &[Word],
+) -> Result<RoaringBitmap> {
+    let mut negative_bitmap = RoaringBitmap::new();
+    for &word in negative_words {
+        if let Some(bitmap) = ctx.word_docids(word)? {
+            negative_bitmap |= bitmap;
+        }
+    }
+    Ok(negative_bitmap)
+}
+
+#[tracing::instrument(level = "trace", skip_all, target = "search")]
+fn resolve_negative_phrases(
+    ctx: &mut SearchContext,
+    negative_phrases: &[LocatedQueryTerm],
+) -> Result<RoaringBitmap> {
+    let mut negative_bitmap = RoaringBitmap::new();
+    for term in negative_phrases {
+        let query_term = ctx.term_interner.get(term.value);
+        if let Some(phrase) = query_term.original_phrase() {
+            negative_bitmap |= ctx.get_phrase_docids(phrase)?;
+        }
+    }
+    Ok(negative_bitmap)
+}
+
 /// Return the list of initialised ranking rules to be used for a placeholder search.
 fn get_ranking_rules_for_placeholder_search<'ctx>(
     ctx: &SearchContext<'ctx>,
@@ -557,6 +588,7 @@ pub fn execute_vector_search(
         documents_ids: docids,
         located_query_terms: None,
         degraded,
+        used_negative_operator: false,
     })
 }

@@ -580,6 +612,7 @@ pub fn execute_search(
 ) -> Result<PartialSearchResult> {
     check_sort_criteria(ctx, sort_criteria.as_ref())?;

+    let mut used_negative_operator = false;
     let mut located_query_terms = None;
     let query_terms = if let Some(query) = query {
         let span = tracing::trace_span!(target: "search::tokens", "tokenizer_builder");
@@ -620,7 +653,16 @@ pub fn execute_search(
         let tokens = tokenizer.tokenize(query);
         drop(entered);

-        let query_terms = located_query_terms_from_tokens(ctx, tokens, words_limit)?;
+        let ExtractedTokens { query_terms, negative_words, negative_phrases } =
+            located_query_terms_from_tokens(ctx, tokens, words_limit)?;
+        used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty();
+
+        let ignored_documents = resolve_negative_words(ctx, &negative_words)?;
+        let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?;
+
+        universe -= ignored_documents;
+        universe -= ignored_phrases;
+
         if query_terms.is_empty() {
             // Do a placeholder search instead
             None
@@ -630,6 +672,7 @@ pub fn execute_search(
     } else {
         None
     };
+
     let bucket_sort_output = if let Some(query_terms) = query_terms {
         let (graph, new_located_query_terms) = QueryGraph::from_query(ctx, &query_terms)?;
         located_query_terms = Some(new_located_query_terms);
@@ -690,6 +733,7 @@ pub fn execute_search(
         documents_ids: docids,
         located_query_terms,
         degraded,
+        used_negative_operator,
     })
 }

@@ -752,4 +796,5 @@ pub struct PartialSearchResult {
     pub document_scores: Vec<Vec<ScoreDetails>>,

     pub degraded: bool,
+    pub used_negative_operator: bool,
 }
@@ -9,7 +9,9 @@ use std::ops::RangeInclusive;

 use either::Either;
 pub use ntypo_subset::NTypoTermSubset;
-pub use parse_query::{located_query_terms_from_tokens, make_ngram, number_of_typos_allowed};
+pub use parse_query::{
+    located_query_terms_from_tokens, make_ngram, number_of_typos_allowed, ExtractedTokens,
+};
 pub use phrase::Phrase;

 use super::interner::{DedupInterner, Interned};
@@ -478,6 +480,11 @@ impl QueryTerm {
     pub fn original_word(&self, ctx: &SearchContext) -> String {
         ctx.word_interner.get(self.original).clone()
     }

+    pub fn original_phrase(&self) -> Option<Interned<Phrase>> {
+        self.zero_typo.phrase
+    }
+
     pub fn all_computed_derivations(&self) -> (Vec<Interned<String>>, Vec<Interned<Phrase>>) {
         let mut words = BTreeSet::new();
         let mut phrases = BTreeSet::new();
@@ -6,20 +6,37 @@ use charabia::{SeparatorKind, TokenKind};
 use super::compute_derivations::partially_initialized_term_from_word;
 use super::{LocatedQueryTerm, ZeroTypoTerm};
 use crate::search::new::query_term::{Lazy, Phrase, QueryTerm};
+use crate::search::new::Word;
 use crate::{Result, SearchContext, MAX_WORD_LENGTH};

+#[derive(Clone)]
+/// Extraction of the content of a query.
+pub struct ExtractedTokens {
+    /// The terms to search for in the database.
+    pub query_terms: Vec<LocatedQueryTerm>,
+    /// The words that must not appear in the results.
+    pub negative_words: Vec<Word>,
+    /// The phrases that must not appear in the results.
+    pub negative_phrases: Vec<LocatedQueryTerm>,
+}
+
 /// Convert the tokenised search query into a list of located query terms.
 #[tracing::instrument(level = "trace", skip_all, target = "search::query")]
 pub fn located_query_terms_from_tokens(
     ctx: &mut SearchContext,
     query: NormalizedTokenIter,
     words_limit: Option<usize>,
-) -> Result<Vec<LocatedQueryTerm>> {
+) -> Result<ExtractedTokens> {
     let nbr_typos = number_of_typos_allowed(ctx)?;

-    let mut located_terms = Vec::new();
+    let mut query_terms = Vec::new();

+    let mut negative_phrase = false;
     let mut phrase: Option<PhraseBuilder> = None;
+    let mut encountered_whitespace = true;
+    let mut negative_next_token = false;
+    let mut negative_words = Vec::new();
+    let mut negative_phrases = Vec::new();

     let parts_limit = words_limit.unwrap_or(usize::MAX);

@@ -31,9 +48,10 @@ pub fn located_query_terms_from_tokens(
         if token.lemma().is_empty() {
             continue;
         }
+
         // early return if word limit is exceeded
-        if located_terms.len() >= parts_limit {
-            return Ok(located_terms);
+        if query_terms.len() >= parts_limit {
+            return Ok(ExtractedTokens { query_terms, negative_words, negative_phrases });
         }

         match token.kind {
@@ -46,6 +64,11 @@ pub fn located_query_terms_from_tokens(
                 // 3. if the word is the last token of the query we push it as a prefix word.
                 if let Some(phrase) = &mut phrase {
                     phrase.push_word(ctx, &token, position)
+                } else if negative_next_token {
+                    let word = token.lemma().to_string();
+                    let word = Word::Original(ctx.word_interner.insert(word));
+                    negative_words.push(word);
+                    negative_next_token = false;
                 } else if peekable.peek().is_some() {
                     match token.kind {
                         TokenKind::Word => {
@@ -61,9 +84,9 @@ pub fn located_query_terms_from_tokens(
                                 value: ctx.term_interner.push(term),
                                 positions: position..=position,
                             };
-                            located_terms.push(located_term);
+                            query_terms.push(located_term);
                         }
-                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => {}
+                        TokenKind::StopWord | TokenKind::Separator(_) | TokenKind::Unknown => (),
                     }
                 } else {
                     let word = token.lemma();
@@ -78,7 +101,7 @@ pub fn located_query_terms_from_tokens(
                         value: ctx.term_interner.push(term),
                         positions: position..=position,
                     };
-                    located_terms.push(located_term);
+                    query_terms.push(located_term);
                 }
             }
             TokenKind::Separator(separator_kind) => {
@@ -94,7 +117,14 @@ pub fn located_query_terms_from_tokens(
                 let phrase = if separator_kind == SeparatorKind::Hard {
                     if let Some(phrase) = phrase {
                         if let Some(located_query_term) = phrase.build(ctx) {
-                            located_terms.push(located_query_term)
+                            // as we are evaluating a negative operator we put the phrase
+                            // in the negative one *but* we don't reset the negative operator
+                            // as we are immediatly starting a new negative phrase.
+                            if negative_phrase {
+                                negative_phrases.push(located_query_term);
+                            } else {
+                                query_terms.push(located_query_term);
+                            }
                         }
                         Some(PhraseBuilder::empty())
                     } else {
@@ -115,26 +145,49 @@ pub fn located_query_terms_from_tokens(
                         // Per the check above, quote_count > 0
                         quote_count -= 1;
                         if let Some(located_query_term) = phrase.build(ctx) {
-                            located_terms.push(located_query_term)
+                            // we were evaluating a negative operator so we
+                            // put the phrase in the negative phrases
+                            if negative_phrase {
+                                negative_phrases.push(located_query_term);
+                                negative_phrase = false;
+                            } else {
+                                query_terms.push(located_query_term);
+                            }
                         }
                     }

                     // Start new phrase if the token ends with an opening quote
-                    (quote_count % 2 == 1).then_some(PhraseBuilder::empty())
+                    if quote_count % 2 == 1 {
+                        negative_phrase = negative_next_token;
+                        Some(PhraseBuilder::empty())
+                    } else {
+                        None
+                    }
                 };
+
+                negative_next_token =
+                    phrase.is_none() && token.lemma() == "-" && encountered_whitespace;
             }
             _ => (),
         }
+
+        encountered_whitespace =
+            token.lemma().chars().last().filter(|c| c.is_whitespace()).is_some();
     }

     // If a quote is never closed, we consider all of the end of the query as a phrase.
     if let Some(phrase) = phrase.take() {
         if let Some(located_query_term) = phrase.build(ctx) {
-            located_terms.push(located_query_term);
+            // put the phrase in the negative set if we are evaluating a negative operator.
+            if negative_phrase {
+                negative_phrases.push(located_query_term);
+            } else {
+                query_terms.push(located_query_term);
+            }
         }
     }

-    Ok(located_terms)
+    Ok(ExtractedTokens { query_terms, negative_words, negative_phrases })
 }

 pub fn number_of_typos_allowed<'ctx>(
@@ -315,8 +368,10 @@ mod tests {
         let rtxn = index.read_txn()?;
         let mut ctx = SearchContext::new(&index, &rtxn);
         // panics with `attempt to add with overflow` before <https://github.com/meilisearch/meilisearch/issues/3785>
-        let located_query_terms = located_query_terms_from_tokens(&mut ctx, tokens, None)?;
-        assert!(located_query_terms.is_empty());
+        let ExtractedTokens { query_terms, .. } =
+            located_query_terms_from_tokens(&mut ctx, tokens, None)?;
+        assert!(query_terms.is_empty());
+
         Ok(())
     }
 }