Add tests for stop words and fix a couple of bugs

This commit is contained in:
Loïc Lecrenier 2023-04-27 13:30:09 +02:00
parent dd007dceca
commit 374095d42c
5 changed files with 155 additions and 17 deletions

View File

@ -333,7 +333,8 @@ pub fn execute_search(
check_sort_criteria(ctx, sort_criteria.as_ref())?;
let mut located_query_terms = None;
let bucket_sort_output = if let Some(query) = query {
let query_terms = if let Some(query) = query {
// We make sure that the analyzer is aware of the stop words
// this ensures that the query builder is able to properly remove them.
let mut tokbuilder = TokenizerBuilder::new();
@ -351,6 +352,16 @@ pub fn execute_search(
let tokens = tokenizer.tokenize(query);
let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
if query_terms.is_empty() {
// Do a placeholder search instead
None
} else {
Some(query_terms)
}
} else {
None
};
let bucket_sort_output = if let Some(query_terms) = query_terms {
let graph = QueryGraph::from_query(ctx, &query_terms)?;
located_query_terms = Some(query_terms);

View File

@ -241,7 +241,7 @@ impl PhraseBuilder {
}
fn is_empty(&self) -> bool {
self.words.is_empty()
self.words.is_empty() || self.words.iter().all(Option::is_none)
}
// precondition: token has kind Word or StopWord

View File

@ -203,20 +203,15 @@ pub fn compute_phrase_docids(
if words.is_empty() {
return Ok(RoaringBitmap::new());
}
if words.len() == 1 {
if let Some(word) = &words[0] {
if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
return Ok(word_docids);
} else {
return Ok(RoaringBitmap::new());
}
let mut candidates = RoaringBitmap::new();
for word in words.iter().flatten().copied() {
if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
candidates |= word_docids;
} else {
return Ok(RoaringBitmap::new());
}
}
let mut candidates = RoaringBitmap::new();
let mut first_iter = true;
let winsize = words.len().min(3);
for win in words.windows(winsize) {
@ -262,12 +257,8 @@ pub fn compute_phrase_docids(
bitmaps.sort_unstable_by_key(|a| a.len());
for bitmap in bitmaps {
if first_iter {
candidates = bitmap;
first_iter = false;
} else {
candidates &= bitmap;
}
candidates &= bitmap;
// There will be no match, return early
if candidates.is_empty() {
break;

View File

@ -11,6 +11,7 @@ pub mod sort;
pub mod typo;
pub mod typo_proximity;
pub mod words_tms;
pub mod stop_words;
fn collect_field_values(
index: &crate::Index,

View File

@ -0,0 +1,135 @@
/*!
This module tests the following properties about stop words:
- they are not indexed
- they are not searchable
- they are case sensitive
- they are ignored in phrases
- If a query consists only of stop words, a placeholder query is used instead
- A prefix word is never ignored, even if the prefix is a stop word
- Phrases consisting only of stop words are ignored
*/
use std::{collections::BTreeSet, iter::FromIterator};
use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy};
fn create_index() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["title".to_owned()]);
s.set_stop_words(BTreeSet::from_iter([
"to".to_owned(),
"The".to_owned(),
"xyz".to_owned(),
]));
})
.unwrap();
index
.add_documents(documents!([
{
"id": 0,
"title": "Shazam!",
},
{
"id": 1,
"title": "Captain Marvel",
},
{
"id": 2,
"title": "Escape Room",
},
{
"id": 3,
"title": "How to Train Your Dragon: The Hidden World",
},
{
"id": 4,
"title": "Gläss",
},
{
"id": 5,
"title": "How to Attempt to Train Your Dragon",
},
{
"id": 6,
"title": "How to Train Your Dragon: the Hidden World",
},
]))
.unwrap();
index
}
#[test]
fn test_stop_words_not_indexed() {
let index = create_index();
db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
}
#[test]
fn test_ignore_stop_words() {
let index = create_index();
let txn = index.read_txn().unwrap();
// `the` is treated as a prefix here, so it's not ignored
let mut s = Search::new(&txn, &index);
s.query("xyz to the");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
// `xyz` is treated as a prefix here, so it's not ignored
let mut s = Search::new(&txn, &index);
s.query("to the xyz");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
// `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored
let mut s = Search::new(&txn, &index);
s.query("to the xyz ");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let mut s = Search::new(&txn, &index);
s.query("to the dragon xyz");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
}
#[test]
fn test_stop_words_in_phrase() {
let index = create_index();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.query("\"how to train your dragon\"");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]");
let mut s = Search::new(&txn, &index);
s.query("how \"to\" train \"the");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
let mut s = Search::new(&txn, &index);
s.query("how \"to\" train \"The dragon");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
let mut s = Search::new(&txn, &index);
s.query("\"to\"");
s.terms_matching_strategy(TermsMatchingStrategy::Last);
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]");
}