Add tests for stop words and fix a couple of bugs
commit 374095d42c (parent dd007dceca)
@@ -333,7 +333,8 @@ pub fn execute_search(
     check_sort_criteria(ctx, sort_criteria.as_ref())?;

     let mut located_query_terms = None;
-    let bucket_sort_output = if let Some(query) = query {
+
+    let query_terms = if let Some(query) = query {
         // We make sure that the analyzer is aware of the stop words
         // this ensures that the query builder is able to properly remove them.
         let mut tokbuilder = TokenizerBuilder::new();
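The rest of the tokenizer setup falls outside this hunk. As a rough, standalone sketch of what "making the analyzer aware of the stop words" looks like with charabia's `TokenizerBuilder` and an `fst::Set` (the word list, the `main` function, and the exact crate versions below are illustrative assumptions, not the milli code itself):

```rust
use charabia::TokenizerBuilder;
use fst::Set;

fn main() {
    // Assumed example data: milli actually loads this set from the index settings.
    // fst::Set requires the entries to be in lexicographic order.
    let stop_words = Set::from_iter(["the", "to"]).unwrap();

    let mut builder = TokenizerBuilder::new();
    // Hand the stop-word set to the tokenizer so matching tokens are tagged
    // as stop words instead of plain words.
    builder.stop_words(&stop_words);
    let tokenizer = builder.build();

    for token in tokenizer.tokenize("how to train the dragon") {
        // Stop words come out with a StopWord token kind, which the query
        // builder can then skip when building query terms.
        println!("{:?} {:?}", token.lemma(), token.kind);
    }
}
```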
@@ -351,6 +352,16 @@ pub fn execute_search(
         let tokens = tokenizer.tokenize(query);

         let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
+        if query_terms.is_empty() {
+            // Do a placeholder search instead
+            None
+        } else {
+            Some(query_terms)
+        }
+    } else {
+        None
+    };
+    let bucket_sort_output = if let Some(query_terms) = query_terms {
         let graph = QueryGraph::from_query(ctx, &query_terms)?;
         located_query_terms = Some(query_terms);

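In plain terms, the two hunks above split term extraction from bucket sorting: the query is tokenized first, and if every term was removed as a stop word the search falls back to a placeholder search instead of returning nothing. A simplified, self-contained sketch of that control flow (all helpers and the hard-coded stop words are hypothetical stand-ins, not milli's API):

```rust
// Hypothetical stand-in for tokenization with stop-word removal.
fn extract_terms(query: &str) -> Vec<String> {
    let stop_words = ["to", "the"];
    query
        .split_whitespace()
        .filter(|word| !stop_words.contains(word))
        .map(str::to_owned)
        .collect()
}

// Hypothetical stand-ins for the ranked and placeholder search paths.
fn ranked_search(terms: &[String]) -> Vec<u32> {
    println!("ranking against {terms:?}");
    vec![3, 6]
}

fn placeholder_search() -> Vec<u32> {
    vec![0, 1, 2, 3, 4, 5, 6]
}

fn execute_search(query: Option<&str>) -> Vec<u32> {
    // Mirror of the new structure: compute the query terms first...
    let query_terms = match query {
        Some(query) => {
            let terms = extract_terms(query);
            if terms.is_empty() {
                None // do a placeholder search instead
            } else {
                Some(terms)
            }
        }
        None => None,
    };

    // ...then decide between a ranked search and a placeholder search.
    match query_terms {
        Some(terms) => ranked_search(&terms),
        None => placeholder_search(),
    }
}

fn main() {
    // Everything was a stop word: degrade to the placeholder search.
    assert_eq!(execute_search(Some("to the")), placeholder_search());
    // A normal query keeps its terms and goes through ranking.
    assert_eq!(execute_search(Some("train your dragon")), vec![3, 6]);
}
```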
@@ -241,7 +241,7 @@ impl PhraseBuilder
     }

     fn is_empty(&self) -> bool {
-        self.words.is_empty()
+        self.words.is_empty() || self.words.iter().all(Option::is_none)
     }

     // precondition: token has kind Word or StopWord
@@ -203,20 +203,15 @@ pub fn compute_phrase_docids(
     if words.is_empty() {
         return Ok(RoaringBitmap::new());
     }
-    if words.len() == 1 {
-        if let Some(word) = &words[0] {
-            if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
-                return Ok(word_docids);
-            } else {
-                return Ok(RoaringBitmap::new());
-            }
-        } else {
-            return Ok(RoaringBitmap::new());
+    let mut candidates = RoaringBitmap::new();
+    for word in words.iter().flatten().copied() {
+        if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
+            candidates |= word_docids;
+        } else {
+            return Ok(RoaringBitmap::new());
         }
     }

-    let mut candidates = RoaringBitmap::new();
-    let mut first_iter = true;
     let winsize = words.len().min(3);

     for win in words.windows(winsize) {
@@ -262,12 +257,8 @@ pub fn compute_phrase_docids(
         bitmaps.sort_unstable_by_key(|a| a.len());

         for bitmap in bitmaps {
-            if first_iter {
-                candidates = bitmap;
-                first_iter = false;
-            } else {
-                candidates &= bitmap;
-            }
+            candidates &= bitmap;
+
             // There will be no match, return early
             if candidates.is_empty() {
                 break;
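The loop above is a shrinking intersection with an early exit: once the running `candidates` bitmap is empty, further intersections cannot add documents back, so there is no point in continuing. A minimal sketch of the same pattern using the `roaring` crate directly (function and variable names are illustrative, not milli's):

```rust
use roaring::RoaringBitmap;

/// Intersect `candidates` with every bitmap, smallest first, and stop as soon
/// as the running intersection is empty.
fn intersect_all(mut candidates: RoaringBitmap, mut bitmaps: Vec<RoaringBitmap>) -> RoaringBitmap {
    // Smallest bitmaps first: they shrink `candidates` the fastest.
    bitmaps.sort_unstable_by_key(|b| b.len());
    for bitmap in bitmaps {
        candidates &= bitmap;
        // There will be no match, return early.
        if candidates.is_empty() {
            break;
        }
    }
    candidates
}

fn main() {
    let a: RoaringBitmap = (0u32..100).collect();
    let b: RoaringBitmap = (50u32..150).collect();
    let c: RoaringBitmap = (90u32..200).collect();
    let expected: RoaringBitmap = (90u32..100).collect();
    assert_eq!(intersect_all(a, vec![b, c]), expected);
}
```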
@@ -11,6 +11,7 @@ pub mod sort;
 pub mod typo;
 pub mod typo_proximity;
 pub mod words_tms;
+pub mod stop_words;

 fn collect_field_values(
     index: &crate::Index,
milli/src/search/new/tests/stop_words.rs (new file, 135 lines)
@@ -0,0 +1,135 @@
/*!
This module tests the following properties about stop words:
- they are not indexed
- they are not searchable
- they are case sensitive
- they are ignored in phrases
- If a query consists only of stop words, a placeholder query is used instead
- A prefix word is never ignored, even if the prefix is a stop word
- Phrases consisting only of stop words are ignored
*/

use std::{collections::BTreeSet, iter::FromIterator};

use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy};

fn create_index() -> TempIndex {
    let index = TempIndex::new();

    index
        .update_settings(|s| {
            s.set_primary_key("id".to_owned());
            s.set_searchable_fields(vec!["title".to_owned()]);
            s.set_stop_words(BTreeSet::from_iter([
                "to".to_owned(),
                "The".to_owned(),
                "xyz".to_owned(),
            ]));
        })
        .unwrap();

    index
        .add_documents(documents!([
            {
                "id": 0,
                "title": "Shazam!",
            },
            {
                "id": 1,
                "title": "Captain Marvel",
            },
            {
                "id": 2,
                "title": "Escape Room",
            },
            {
                "id": 3,
                "title": "How to Train Your Dragon: The Hidden World",
            },
            {
                "id": 4,
                "title": "Gläss",
            },
            {
                "id": 5,
                "title": "How to Attempt to Train Your Dragon",
            },
            {
                "id": 6,
                "title": "How to Train Your Dragon: the Hidden World",
            },
        ]))
        .unwrap();
    index
}

#[test]
fn test_stop_words_not_indexed() {
    let index = create_index();
    db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
}

#[test]
fn test_ignore_stop_words() {
    let index = create_index();

    let txn = index.read_txn().unwrap();

    // `the` is treated as a prefix here, so it's not ignored
    let mut s = Search::new(&txn, &index);
    s.query("xyz to the");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");

    // `xyz` is treated as a prefix here, so it's not ignored
    let mut s = Search::new(&txn, &index);
    s.query("to the xyz");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");

    // `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored
    let mut s = Search::new(&txn, &index);
    s.query("to the xyz ");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");

    let mut s = Search::new(&txn, &index);
    s.query("to the dragon xyz");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
}

#[test]
fn test_stop_words_in_phrase() {
    let index = create_index();

    let txn = index.read_txn().unwrap();

    let mut s = Search::new(&txn, &index);
    s.query("\"how to train your dragon\"");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]");

    let mut s = Search::new(&txn, &index);
    s.query("how \"to\" train \"the");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");

    let mut s = Search::new(&txn, &index);
    s.query("how \"to\" train \"The dragon");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");

    let mut s = Search::new(&txn, &index);
    s.query("\"to\"");
    s.terms_matching_strategy(TermsMatchingStrategy::Last);
    let SearchResult { documents_ids, .. } = s.execute().unwrap();
    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]");
}