test a new implementation of the stop_words

This commit is contained in:
tamo 2021-04-08 21:21:20 +02:00
parent da036dcc3e
commit dcb00b2e54
No known key found for this signature in database
GPG Key ID: 20CD8020AFA88D69
3 changed files with 25 additions and 13 deletions

View File

@ -91,7 +91,14 @@ impl<'a> Search<'a> {
let mut builder = QueryTreeBuilder::new(self.rtxn, self.index); let mut builder = QueryTreeBuilder::new(self.rtxn, self.index);
builder.optional_words(self.optional_words); builder.optional_words(self.optional_words);
builder.authorize_typos(self.authorize_typos); builder.authorize_typos(self.authorize_typos);
let analyzer = Analyzer::<Vec<u8>>::new(AnalyzerConfig::default()); // We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
let mut config = AnalyzerConfig::default();
let stop_words = self.index.stop_words(self.rtxn)?;
if let Some(ref stop_words) = stop_words {
config.stop_words(stop_words);
}
let analyzer = Analyzer::new(config);
let result = analyzer.analyze(query); let result = analyzer.analyze(query);
let tokens = result.tokens(); let tokens = result.tokens();
builder.build(tokens)? builder.build(tokens)?

View File

@ -1,6 +1,7 @@
use std::collections::HashSet; use std::collections::HashSet;
use std::{fmt, cmp, mem}; use std::{fmt, cmp, mem};
use fst::Set;
use levenshtein_automata::{DFA, Distance}; use levenshtein_automata::{DFA, Distance};
use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream}; use meilisearch_tokenizer::{TokenKind, tokenizer::TokenStream};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -220,7 +221,8 @@ impl<'a> QueryTreeBuilder<'a> {
/// forcing all query words to match documents without any typo /// forcing all query words to match documents without any typo
/// (the criterion `typo` will be ignored) /// (the criterion `typo` will be ignored)
pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> { pub fn build(&self, query: TokenStream) -> anyhow::Result<Option<Operation>> {
let primitive_query = create_primitive_query(query); let stop_words = self.index.stop_words(self.rtxn)?;
let primitive_query = create_primitive_query(query, stop_words);
if !primitive_query.is_empty() { if !primitive_query.is_empty() {
create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some) create_query_tree(self, self.optional_words, self.authorize_typos, primitive_query).map(Some)
} else { } else {
@ -385,7 +387,7 @@ fn create_query_tree(
and_op_children.push(operation); and_op_children.push(operation);
}, },
words => { words => {
let is_prefix = words.last().map(|part| part.is_prefix()).unwrap_or(false); let is_prefix = words.last().map_or(false, |part| part.is_prefix());
let words: Vec<_> = words.iter().filter_map(|part| { let words: Vec<_> = words.iter().filter_map(|part| {
if let PrimitiveQueryPart::Word(word, _) = part { if let PrimitiveQueryPart::Word(word, _) = part {
Some(word.as_str()) Some(word.as_str())
@ -474,7 +476,7 @@ impl PrimitiveQueryPart {
/// Create primitive query from tokenized query string, /// Create primitive query from tokenized query string,
/// the primitive query is an intermediate state to build the query tree. /// the primitive query is an intermediate state to build the query tree.
fn create_primitive_query(query: TokenStream) -> PrimitiveQuery { fn create_primitive_query(query: TokenStream, stop_words: Option<Set<&[u8]>>) -> PrimitiveQuery {
let mut primitive_query = Vec::new(); let mut primitive_query = Vec::new();
let mut phrase = Vec::new(); let mut phrase = Vec::new();
let mut quoted = false; let mut quoted = false;
@ -482,14 +484,16 @@ fn create_primitive_query(query: TokenStream) -> PrimitiveQuery {
let mut peekable = query.peekable(); let mut peekable = query.peekable();
while let Some(token) = peekable.next() { while let Some(token) = peekable.next() {
match token.kind { match token.kind {
TokenKind::Word => { TokenKind::Word | TokenKind::StopWord => {
// 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote, // 1. if the word is quoted we push it in a phrase-buffer waiting for the ending quote,
// 2. if the word is not the last token of the query we push it as a non-prefix word, // 2. if the word is not the last token of the query and is not a stop_word we push it as a non-prefix word,
// 3. if the word is the last token of the query we push it as a prefix word. // 3. if the word is the last token of the query we push it as a prefix word.
if quoted { if quoted {
phrase.push(token.word.to_string()); phrase.push(token.word.to_string());
} else if peekable.peek().is_some() { } else if peekable.peek().is_some() {
if !stop_words.as_ref().map_or(false, |swords| swords.contains(token.word.as_ref())) {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false)); primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), false));
}
} else { } else {
primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true)); primitive_query.push(PrimitiveQueryPart::Word(token.word.to_string(), true));
} }
@ -563,7 +567,7 @@ mod test {
query: TokenStream, query: TokenStream,
) -> anyhow::Result<Option<Operation>> ) -> anyhow::Result<Option<Operation>>
{ {
let primitive_query = create_primitive_query(query); let primitive_query = create_primitive_query(query, None);
if !primitive_query.is_empty() { if !primitive_query.is_empty() {
create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some) create_query_tree(self, optional_words, authorize_typos, primitive_query).map(Some)
} else { } else {

View File

@ -602,12 +602,13 @@ mod tests {
assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes()); assert_eq!(stop_words.as_fst().as_bytes(), expected.as_fst().as_bytes());
// when we search for something that is a non-prefix stop word, it should be ignored
// thus we should get a placeholder search (all the results = 3)
let result = index.search(&rtxn).query("the ").execute().unwrap(); let result = index.search(&rtxn).query("the ").execute().unwrap();
assert!(result.documents_ids.is_empty()); assert_eq!(result.documents_ids.len(), 3);
let result = index.search(&rtxn).query("i ").execute().unwrap(); let result = index.search(&rtxn).query("i ").execute().unwrap();
assert!(result.documents_ids.is_empty()); assert_eq!(result.documents_ids.len(), 3);
let result = index.search(&rtxn).query("are ").execute().unwrap(); let result = index.search(&rtxn).query("are ").execute().unwrap();
assert!(result.documents_ids.is_empty()); assert_eq!(result.documents_ids.len(), 3);
let result = index.search(&rtxn).query("dog").execute().unwrap(); let result = index.search(&rtxn).query("dog").execute().unwrap();
assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos assert_eq!(result.documents_ids.len(), 2); // we have two maxims talking about doggos