mirror of https://github.com/meilisearch/MeiliSearch
Integrate the stop_words in the querytree
Remove the stop words from the query tree, except when the word is a prefix or is typo tolerant.
This commit is contained in:
parent a2f46029c7
commit 12fb509d84
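For readers skimming the diff below, here is a minimal, self-contained sketch (not part of the commit) of the rule this change enforces: a stop word is dropped from the query tree only when it is queried exactly, i.e. when the query part is neither a prefix nor typo tolerant. The `QuerySketch` type, the `keep_in_query_tree` helper, and the stop-word list are hypothetical stand-ins; in the real diff the condition reads `query.prefix || query.kind.is_tolerant() || !is_stop_word` inside `create_query_tree`.

use std::collections::HashSet;

// Hypothetical, simplified stand-in for the real Query type: only the two
// properties the new condition inspects, plus the word itself.
struct QuerySketch<'a> {
    prefix: bool,      // the part is searched as a prefix
    is_tolerant: bool, // typos are allowed for this part
    word: &'a str,
}

// The rule introduced by this commit: drop a stop word from the tree only
// when the part is neither a prefix nor typo tolerant.
fn keep_in_query_tree(q: &QuerySketch, stop_words: &HashSet<&str>) -> bool {
    let is_stop_word = stop_words.contains(q.word);
    q.prefix || q.is_tolerant || !is_stop_word
}

fn main() {
    let stop_words: HashSet<&str> = ["a", "of", "the"].iter().copied().collect();

    let exact = QuerySketch { prefix: false, is_tolerant: false, word: "the" };
    let prefix = QuerySketch { prefix: true, is_tolerant: false, word: "the" };
    let typo = QuerySketch { prefix: false, is_tolerant: true, word: "the" };

    assert!(!keep_in_query_tree(&exact, &stop_words)); // dropped: exact stop word
    assert!(keep_in_query_tree(&prefix, &stop_words)); // kept: prefix search
    assert!(keep_in_query_tree(&typo, &stop_words));   // kept: typo tolerant
}

The rationale, as the commit message states, is that prefix and typo-tolerant parts can still match words that merely start with or resemble a stop word, so those parts are kept.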
@@ -1,6 +1,7 @@
 use std::collections::HashSet;
 use std::{fmt, cmp, mem};
 
+use fst::Set;
 use levenshtein_automata::{DFA, Distance};
 use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
 use roaring::RoaringBitmap;
@@ -154,6 +155,10 @@ impl fmt::Debug for Query {
 
 trait Context {
     fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
+    fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
+        Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
+    }
     fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
     fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
         match self.word_docids(word)? {
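The hunk above adds `stop_words()` as a required method and `is_stop_word()` as a provided method, so every `Context` implementor gets the stop-word check for free. A reduced, hypothetical illustration of that pattern follows; `StopWordSource` and `InMemory` are stand-ins rather than the real `Context` trait, but the `fst::Set` calls (`from_iter`, `contains`) are the crate API the diff relies on.

use fst::Set;

// Simplified stand-in for the Context trait above: the provided method
// `is_stop_word` is derived from the single required method `stop_words`.
trait StopWordSource {
    fn stop_words(&self) -> Option<&Set<Vec<u8>>>;

    fn is_stop_word(&self, word: &str) -> bool {
        // Same shape as the default method added in the hunk above.
        self.stop_words().map_or(false, |s| s.contains(word))
    }
}

struct InMemory {
    set: Set<Vec<u8>>,
}

impl StopWordSource for InMemory {
    // Only this method has to be written; is_stop_word comes for free.
    fn stop_words(&self) -> Option<&Set<Vec<u8>>> {
        Some(&self.set)
    }
}

fn main() -> Result<(), fst::Error> {
    // Set::from_iter requires its keys in lexicographic order.
    let ctx = InMemory { set: Set::from_iter(vec!["and", "the"])? };

    assert!(ctx.is_stop_word("the"));    // found in the fst set
    assert!(!ctx.is_stop_word("query")); // not a stop word
    Ok(())
}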
@@ -183,6 +188,10 @@ impl<'a> Context for QueryTreeBuilder<'a> {
     fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
         Ok(None)
     }
+
+    fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+        self.index.stop_words(self.rtxn)
+    }
 }
 
 impl<'a> QueryTreeBuilder<'a> {
@@ -331,8 +340,7 @@ fn create_query_tree(
     optional_words: bool,
     authorize_typos: bool,
     query: PrimitiveQuery,
-) -> anyhow::Result<Operation>
-{
+) -> anyhow::Result<Operation> {
     /// Matches on the `PrimitiveQueryPart` and create an operation from it.
     fn resolve_primitive_part(
         ctx: &impl Context,
@@ -350,7 +358,12 @@ fn create_query_tree(
                 if let Some(child) = split_best_frequency(ctx, &word)? {
                     children.push(child);
                 }
-                children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
+
+                let is_stop_word = ctx.is_stop_word(&word)?;
+                let query = Query { prefix, kind: typos(word, authorize_typos) };
+                if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                    children.push(Operation::Query(query));
+                }
                 Ok(Operation::or(false, children))
             },
             // create a CONSECUTIVE operation wrapping all word in the phrase
@@ -365,12 +378,11 @@ fn create_query_tree(
         ctx: &impl Context,
         authorize_typos: bool,
         query: &[PrimitiveQueryPart],
-    ) -> anyhow::Result<Operation>
-    {
+    ) -> anyhow::Result<Operation> {
        const MAX_NGRAM: usize = 3;
        let mut op_children = Vec::new();
 
-        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase()) ) {
+        for sub_query in query.linear_group_by(|a, b| !(a.is_phrase() || b.is_phrase())) {
            let mut or_op_children = Vec::new();
 
            for ngram in 1..=MAX_NGRAM.min(sub_query.len()) {
@@ -381,25 +393,33 @@ fn create_query_tree(
 
                    match group {
                        [part] => {
-                            let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
+                            let operation =
+                                resolve_primitive_part(ctx, authorize_typos, part.clone())?;
                            and_op_children.push(operation);
-                        },
+                        }
                        words => {
-                            let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false);
-                            let words: Vec<_> = words.iter().filter_map(| part| {
-                                if let PrimitiveQueryPart::Word(word, _) = part {
-                                    Some(word.as_str())
-                                } else {
-                                    None
-                                }
-                            }).collect();
+                            let is_prefix = words.last().map_or(false, |part| part.is_prefix());
+                            let words: Vec<_> = words
+                                .iter()
+                                .filter_map(|part| {
+                                    if let PrimitiveQueryPart::Word(word, _) = part {
+                                        Some(word.as_str())
+                                    } else {
+                                        None
+                                    }
+                                })
+                                .collect();
                            let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
                            let concat = words.concat();
+
+                            let is_stop_word = ctx.is_stop_word(&concat)?;
                            let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
-                            operations.push(Operation::Query(query));
-                            and_op_children.push(Operation::or(false, operations));
+                            if query.prefix || query.kind.is_tolerant() || !is_stop_word {
+                                operations.push(Operation::Query(query));
+                                and_op_children.push(Operation::or(false, operations));
+                            }
                        }
                    }
 
                    if !is_last {
                        let ngrams = ngrams(ctx, authorize_typos, tail)?;
@@ -581,6 +601,10 @@ mod test {
             let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
             Ok(self.synonyms.get(&words).cloned())
         }
+
+        fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
+            Ok(None)
+        }
     }
 
     impl Default for TestContext {