mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-11 05:54:30 +01:00
commit
c2df51aa95
@ -91,7 +91,14 @@ impl<'a> Search<'a> {
|
|||||||
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
|
||||||
builder.optional_words(self.optional_words);
|
builder.optional_words(self.optional_words);
|
||||||
builder.authorize_typos(self.authorize_typos);
|
builder.authorize_typos(self.authorize_typos);
|
||||||
let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default());
|
// We make sure that the analyzer is aware of the stop words
|
||||||
|
// this ensures that the query builder is able to properly remove them.
|
||||||
|
let mut config = AnalyzerConfig::default();
|
||||||
|
let stop_words = self.index.stop_words(self.rtxn)?;
|
||||||
|
if let Some(ref stop_words) = stop_words {
|
||||||
|
config.stop_words(stop_words);
|
||||||
|
}
|
||||||
|
let analyzer = Analyzer::new(config);
|
||||||
let result = analyzer.analyze(query);
|
let result = analyzer.analyze(query);
|
||||||
let tokens = result.tokens();
|
let tokens = result.tokens();
|
||||||
builder.build(tokens)?
|
builder.build(tokens)?
|
||||||
|
@ -155,10 +155,6 @@ impl fmt::Debug for Query {
|
|||||||
|
|
||||||
trait Context {
|
trait Context {
|
||||||
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
fn word_docids(&self, word: &str) -> heed::Result<Option<RoaringBitmap>>;
|
||||||
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>>;
|
|
||||||
fn is_stop_word(&self, word: &str) -> anyhow::Result<bool> {
|
|
||||||
Ok(self.stop_words()?.map_or(false, |s| s.contains(word)))
|
|
||||||
}
|
|
||||||
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
|
fn synonyms<S: AsRef<str>>(&self, words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>>;
|
||||||
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
|
fn word_documents_count(&self, word: &str) -> heed::Result<Option<u64>> {
|
||||||
match self.word_docids(word)? {
|
match self.word_docids(word)? {
|
||||||
@ -188,10 +184,6 @@ impl<'a> Context for QueryTreeBuilder<'a> {
|
|||||||
fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
fn synonyms<S: AsRef<str>>(&self, _words: &[S]) -> heed::Result<Option<Vec<Vec<String>>>> {
|
||||||
Ok(None)
|
Ok(None)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
|
|
||||||
self.index.stop_words(self.rtxn)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> QueryTreeBuilder<'a> {
|
impl<'a> QueryTreeBuilder<'a> {
|
||||||
@ -229,7 +221,8 @@ impl<'a> QueryTreeBuilder<'a> {
|
|||||||
/// forcing all query words to match documents without any typo
|
/// forcing all query words to match documents without any typo
|
||||||
/// (the criterion `typo` will be ignored)
|
/// (the criterion `typo` will be ignored)
|
||||||
pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> {
|
pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> {
|
||||||
let primitive_query = create_primitive_query(query);
|
let stop_words = self.index.stop_words(self.rtxn)?;
|
||||||
|
let primitive_query = create_primitive_query(query, stop_words);
|
||||||
if !primitive_query.is_empty() {
|
if !primitive_query.is_empty() {
|
||||||
create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some)
|
create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some)
|
||||||
} else {
|
} else {
|
||||||
@ -340,7 +333,8 @@ fn create_query_tree(
|
|||||||
optional_words: bool,
|
optional_words: bool,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
query: PrimitiveQuery,
|
query: PrimitiveQuery,
|
||||||
) -> anyhow::Result<Operation> {
|
) -> anyhow::Result<Operation>
|
||||||
|
{
|
||||||
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
|
/// Matches on the `PrimitiveQueryPart` and create an operation from it.
|
||||||
fn resolve_primitive_part(
|
fn resolve_primitive_part(
|
||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
@ -358,12 +352,7 @@ fn create_query_tree(
|
|||||||
if let Some(child) = split_best_frequency(ctx, &word)? {
|
if let Some(child) = split_best_frequency(ctx, &word)? {
|
||||||
children.push(child);
|
children.push(child);
|
||||||
}
|
}
|
||||||
|
children.push(Operation::Query(Query { prefix, kind: typos(word, authorize_typos) }));
|
||||||
let is_stop_word = ctx.is_stop_word(&word)?;
|
|
||||||
let query = Query { prefix, kind: typos(word, authorize_typos) };
|
|
||||||
if query.prefix || query.kind.is_tolerant() || !is_stop_word {
|
|
||||||
children.push(Operation::Query(query));
|
|
||||||
}
|
|
||||||
Ok(Operation::or(false, children))
|
Ok(Operation::or(false, children))
|
||||||
},
|
},
|
||||||
// create a CONSECUTIVE operation wrapping all word in the phrase
|
// create a CONSECUTIVE operation wrapping all word in the phrase
|
||||||
@ -378,7 +367,8 @@ fn create_query_tree(
|
|||||||
ctx: &impl Context,
|
ctx: &impl Context,
|
||||||
authorize_typos: bool,
|
authorize_typos: bool,
|
||||||
query: &[PrimitiveQueryPart],
|
query: &[PrimitiveQueryPart],
|
||||||
) -> anyhow::Result<Operation> {
|
) -> anyhow::Result<Operation>
|
||||||
|
{
|
||||||
const MAX_NGRAM: usize = 3;
|
const MAX_NGRAM: usize = 3;
|
||||||
let mut op_children = Vec::new();
|
let mut op_children = Vec::new();
|
||||||
|
|
||||||
@ -393,33 +383,25 @@ fn create_query_tree(
|
|||||||
|
|
||||||
match group {
|
match group {
|
||||||
[part] => {
|
[part] => {
|
||||||
let operation =
|
let operation = resolve_primitive_part(ctx, authorize_typos, part.clone())?;
|
||||||
resolve_primitive_part(ctx, authorize_typos, part.clone())?;
|
|
||||||
and_op_children.push(operation);
|
and_op_children.push(operation);
|
||||||
}
|
},
|
||||||
words => {
|
words => {
|
||||||
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
|
let is_prefix = words.last().map_or(false, |part| part.is_prefix());
|
||||||
let words: Vec<_> = words
|
let words: Vec<_> = words.iter().filter_map(|part| {
|
||||||
.iter()
|
|
||||||
.filter_map(|part| {
|
|
||||||
if let PrimitiveQueryPart::Word(word, _) = part {
|
if let PrimitiveQueryPart::Word(word, _) = part {
|
||||||
Some(word.as_str())
|
Some(word.as_str())
|
||||||
} else {
|
} else {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
})
|
}).collect();
|
||||||
.collect();
|
|
||||||
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
|
let mut operations = synonyms(ctx, &words)?.unwrap_or_default();
|
||||||
let concat = words.concat();
|
let concat = words.concat();
|
||||||
|
|
||||||
let is_stop_word = ctx.is_stop_word(&concat)?;
|
|
||||||
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
|
let query = Query { prefix: is_prefix, kind: typos(concat, authorize_typos) };
|
||||||
if query.prefix || query.kind.is_tolerant() || !is_stop_word {
|
|
||||||
operations.push(Operation::Query(query));
|
operations.push(Operation::Query(query));
|
||||||
and_op_children.push(Operation::or(false, operations));
|
and_op_children.push(Operation::or(false, operations));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
if !is_last {
|
if !is_last {
|
||||||
let ngrams = ngrams(ctx, authorize_typos, tail)?;
|
let ngrams = ngrams(ctx, authorize_typos, tail)?;
|
||||||
@ -494,7 +476,7 @@ impl PrimitiveQueryPart {
|
|||||||
|
|
||||||
/// Create primitive query from tokenized query string,
|
/// Create primitive query from tokenized query string,
|
||||||
/// the primitive query is an intermediate state to build the query tree.
|
/// the primitive query is an intermediate state to build the query tree.
|
||||||
fn create_primitive_query(query: TokenStream) -> PrimitiveQuery {
|
fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>) -> PrimitiveQuery {
|
||||||
let mut primitive_query = Vec::new();
|
let mut primitive_query = Vec::new();
|
||||||
let mut phrase = Vec::new();
|
let mut phrase = Vec::new();
|
||||||
let mut quoted = false;
|
let mut quoted = false;
|
||||||
@ -502,14 +484,16 @@ fn create_primitive_query(query: TokenStream) -> PrimitiveQuery {
|
|||||||
let mut peekable = query.peekable();
|
let mut peekable = query.peekable();
|
||||||
while let Some(token) = peekable.next() {
|
while let Some(token) = peekable.next() {
|
||||||
match token.kind {
|
match token.kind {
|
||||||
TokenKind::Word => {
|
TokenKind::Word | TokenKind::StopWord => {
|
||||||
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
|
||||||
// 2. if the word is not the last token of the query we push it as a non-prefix word,
|
// 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
|
||||||
// 3. if the word is the last token of the query we push it as a prefix word.
|
// 3. if the word is the last token of the query we push it as a prefix word.
|
||||||
if quoted {
|
if quoted {
|
||||||
phrase.push(token.word.to_string());
|
phrase.push(token.word.to_string());
|
||||||
} else if peekable.peek().is_some() {
|
} else if peekable.peek().is_some() {
|
||||||
|
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) {
|
||||||
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
|
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
|
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
|
||||||
}
|
}
|
||||||
@ -583,7 +567,7 @@ mod test {
|
|||||||
query: TokenStream,
|
query: TokenStream,
|
||||||
) -> anyhow::Result<Option<Operation>>
|
) -> anyhow::Result<Option<Operation>>
|
||||||
{
|
{
|
||||||
let primitive_query = create_primitive_query(query);
|
let primitive_query = create_primitive_query(query, None);
|
||||||
if !primitive_query.is_empty() {
|
if !primitive_query.is_empty() {
|
||||||
create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some)
|
create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some)
|
||||||
} else {
|
} else {
|
||||||
@ -601,10 +585,6 @@ mod test {
|
|||||||
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
let words: Vec<_> = words.iter().map(|s| s.as_ref().to_owned()).collect();
|
||||||
Ok(self.synonyms.get(&words).cloned())
|
Ok(self.synonyms.get(&words).cloned())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn stop_words(&self) -> anyhow::Result<Option<Set<&[u8]>>> {
|
|
||||||
Ok(None)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for TestContext {
|
impl Default for TestContext {
|
||||||
|
@ -602,12 +602,13 @@ mod tests {
|
|||||||
assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
|
assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
|
||||||
|
|
||||||
// when we search for something that is a non prefix stop_words it should be ignored
|
// when we search for something that is a non prefix stop_words it should be ignored
|
||||||
|
// thus we should get a placeholder search (all the results = 3)
|
||||||
let result = index.search(&rtxn).query("the ").execute().unwrap();
|
let result = index.search(&rtxn).query("the ").execute().unwrap();
|
||||||
assert!(result.documents_ids.is_empty());
|
assert_eq!(result.documents_ids.len(), 3);
|
||||||
let result = index.search(&rtxn).query("i ").execute().unwrap();
|
let result = index.search(&rtxn).query("i ").execute().unwrap();
|
||||||
assert!(result.documents_ids.is_empty());
|
assert_eq!(result.documents_ids.len(), 3);
|
||||||
let result = index.search(&rtxn).query("are ").execute().unwrap();
|
let result = index.search(&rtxn).query("are ").execute().unwrap();
|
||||||
assert!(result.documents_ids.is_empty());
|
assert_eq!(result.documents_ids.len(), 3);
|
||||||
|
|
||||||
let result = index.search(&rtxn).query("dog").execute().unwrap();
|
let result = index.search(&rtxn).query("dog").execute().unwrap();
|
||||||
assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
|
assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos
|
||||||
|
Loading…
x
Reference in New Issue
Block a user