mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Add tests for stop words and fix a couple of bugs
This commit is contained in:
parent
dd007dceca
commit
374095d42c
@ -333,7 +333,8 @@ pub fn execute_search(
|
|||||||
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
check_sort_criteria(ctx, sort_criteria.as_ref())?;
|
||||||
|
|
||||||
let mut located_query_terms = None;
|
let mut located_query_terms = None;
|
||||||
let bucket_sort_output = if let Some(query) = query {
|
|
||||||
|
let query_terms = if let Some(query) = query {
|
||||||
// We make sure that the analyzer is aware of the stop words
|
// We make sure that the analyzer is aware of the stop words
|
||||||
// this ensures that the query builder is able to properly remove them.
|
// this ensures that the query builder is able to properly remove them.
|
||||||
let mut tokbuilder = TokenizerBuilder::new();
|
let mut tokbuilder = TokenizerBuilder::new();
|
||||||
@ -351,6 +352,16 @@ pub fn execute_search(
|
|||||||
let tokens = tokenizer.tokenize(query);
|
let tokens = tokenizer.tokenize(query);
|
||||||
|
|
||||||
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
|
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
|
||||||
|
if query_terms.is_empty() {
|
||||||
|
// Do a placeholder search instead
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(query_terms)
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
let bucket_sort_output = if let Some(query_terms) = query_terms {
|
||||||
let graph = QueryGraph::from_query(ctx, &query_terms)?;
|
let graph = QueryGraph::from_query(ctx, &query_terms)?;
|
||||||
located_query_terms = Some(query_terms);
|
located_query_terms = Some(query_terms);
|
||||||
|
|
||||||
|
@ -241,7 +241,7 @@ impl PhraseBuilder {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn is_empty(&self) -> bool {
|
fn is_empty(&self) -> bool {
|
||||||
self.words.is_empty()
|
self.words.is_empty() || self.words.iter().all(Option::is_none)
|
||||||
}
|
}
|
||||||
|
|
||||||
// precondition: token has kind Word or StopWord
|
// precondition: token has kind Word or StopWord
|
||||||
|
@ -203,20 +203,15 @@ pub fn compute_phrase_docids(
|
|||||||
if words.is_empty() {
|
if words.is_empty() {
|
||||||
return Ok(RoaringBitmap::new());
|
return Ok(RoaringBitmap::new());
|
||||||
}
|
}
|
||||||
if words.len() == 1 {
|
let mut candidates = RoaringBitmap::new();
|
||||||
if let Some(word) = &words[0] {
|
for word in words.iter().flatten().copied() {
|
||||||
if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
|
if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
|
||||||
return Ok(word_docids);
|
candidates |= word_docids;
|
||||||
} else {
|
|
||||||
return Ok(RoaringBitmap::new());
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
return Ok(RoaringBitmap::new());
|
return Ok(RoaringBitmap::new());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut candidates = RoaringBitmap::new();
|
|
||||||
let mut first_iter = true;
|
|
||||||
let winsize = words.len().min(3);
|
let winsize = words.len().min(3);
|
||||||
|
|
||||||
for win in words.windows(winsize) {
|
for win in words.windows(winsize) {
|
||||||
@ -262,12 +257,8 @@ pub fn compute_phrase_docids(
|
|||||||
bitmaps.sort_unstable_by_key(|a| a.len());
|
bitmaps.sort_unstable_by_key(|a| a.len());
|
||||||
|
|
||||||
for bitmap in bitmaps {
|
for bitmap in bitmaps {
|
||||||
if first_iter {
|
|
||||||
candidates = bitmap;
|
|
||||||
first_iter = false;
|
|
||||||
} else {
|
|
||||||
candidates &= bitmap;
|
candidates &= bitmap;
|
||||||
}
|
|
||||||
// There will be no match, return early
|
// There will be no match, return early
|
||||||
if candidates.is_empty() {
|
if candidates.is_empty() {
|
||||||
break;
|
break;
|
||||||
|
@ -11,6 +11,7 @@ pub mod sort;
|
|||||||
pub mod typo;
|
pub mod typo;
|
||||||
pub mod typo_proximity;
|
pub mod typo_proximity;
|
||||||
pub mod words_tms;
|
pub mod words_tms;
|
||||||
|
pub mod stop_words;
|
||||||
|
|
||||||
fn collect_field_values(
|
fn collect_field_values(
|
||||||
index: &crate::Index,
|
index: &crate::Index,
|
||||||
|
135
milli/src/search/new/tests/stop_words.rs
Normal file
135
milli/src/search/new/tests/stop_words.rs
Normal file
@ -0,0 +1,135 @@
|
|||||||
|
/*!
|
||||||
|
This module tests the following properties about stop words:
|
||||||
|
- they are not indexed
|
||||||
|
- they are not searchable
|
||||||
|
- they are case sensitive
|
||||||
|
- they are ignored in phrases
|
||||||
|
- If a query consists only of stop words, a placeholder query is used instead
|
||||||
|
- A prefix word is never ignored, even if the prefix is a stop word
|
||||||
|
- Phrases consisting only of stop words are ignored
|
||||||
|
*/
|
||||||
|
|
||||||
|
use std::{collections::BTreeSet, iter::FromIterator};
|
||||||
|
|
||||||
|
use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy};
|
||||||
|
|
||||||
|
fn create_index() -> TempIndex {
|
||||||
|
let index = TempIndex::new();
|
||||||
|
|
||||||
|
index
|
||||||
|
.update_settings(|s| {
|
||||||
|
s.set_primary_key("id".to_owned());
|
||||||
|
s.set_searchable_fields(vec!["title".to_owned()]);
|
||||||
|
s.set_stop_words(BTreeSet::from_iter([
|
||||||
|
"to".to_owned(),
|
||||||
|
"The".to_owned(),
|
||||||
|
"xyz".to_owned(),
|
||||||
|
]));
|
||||||
|
})
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
index
|
||||||
|
.add_documents(documents!([
|
||||||
|
{
|
||||||
|
"id": 0,
|
||||||
|
"title": "Shazam!",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 1,
|
||||||
|
"title": "Captain Marvel",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 2,
|
||||||
|
"title": "Escape Room",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 3,
|
||||||
|
"title": "How to Train Your Dragon: The Hidden World",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 4,
|
||||||
|
"title": "Gläss",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 5,
|
||||||
|
"title": "How to Attempt to Train Your Dragon",
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": 6,
|
||||||
|
"title": "How to Train Your Dragon: the Hidden World",
|
||||||
|
},
|
||||||
|
]))
|
||||||
|
.unwrap();
|
||||||
|
index
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stop_words_not_indexed() {
|
||||||
|
let index = create_index();
|
||||||
|
db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_ignore_stop_words() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
// `the` is treated as a prefix here, so it's not ignored
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("xyz to the");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
|
||||||
|
// `xyz` is treated as a prefix here, so it's not ignored
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("to the xyz");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
|
||||||
|
// `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("to the xyz ");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("to the dragon xyz");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_stop_words_in_phrase() {
|
||||||
|
let index = create_index();
|
||||||
|
|
||||||
|
let txn = index.read_txn().unwrap();
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"how to train your dragon\"");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]");
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("how \"to\" train \"the");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("how \"to\" train \"The dragon");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
|
||||||
|
|
||||||
|
let mut s = Search::new(&txn, &index);
|
||||||
|
s.query("\"to\"");
|
||||||
|
s.terms_matching_strategy(TermsMatchingStrategy::Last);
|
||||||
|
let SearchResult { documents_ids, .. } = s.execute().unwrap();
|
||||||
|
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]");
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user