From 374095d42c8d0f00930cb3449fc1aeda25652445 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 27 Apr 2023 13:30:09 +0200 Subject: [PATCH] Add tests for stop words and fix a couple of bugs --- milli/src/search/new/mod.rs | 13 +- .../src/search/new/query_term/parse_query.rs | 2 +- milli/src/search/new/resolve_query_graph.rs | 21 +-- milli/src/search/new/tests/mod.rs | 1 + milli/src/search/new/tests/stop_words.rs | 135 ++++++++++++++++++ 5 files changed, 155 insertions(+), 17 deletions(-) create mode 100644 milli/src/search/new/tests/stop_words.rs diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 375f7c774..246745678 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -333,7 +333,8 @@ pub fn execute_search( check_sort_criteria(ctx, sort_criteria.as_ref())?; let mut located_query_terms = None; - let bucket_sort_output = if let Some(query) = query { + + let query_terms = if let Some(query) = query { // We make sure that the analyzer is aware of the stop words // this ensures that the query builder is able to properly remove them. let mut tokbuilder = TokenizerBuilder::new(); @@ -351,6 +352,16 @@ pub fn execute_search( let tokens = tokenizer.tokenize(query); let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?; + if query_terms.is_empty() { + // Do a placeholder search instead + None + } else { + Some(query_terms) + } + } else { + None + }; + let bucket_sort_output = if let Some(query_terms) = query_terms { let graph = QueryGraph::from_query(ctx, &query_terms)?; located_query_terms = Some(query_terms); diff --git a/milli/src/search/new/query_term/parse_query.rs b/milli/src/search/new/query_term/parse_query.rs index 91b888dcf..734938551 100644 --- a/milli/src/search/new/query_term/parse_query.rs +++ b/milli/src/search/new/query_term/parse_query.rs @@ -241,7 +241,7 @@ impl PhraseBuilder { } fn is_empty(&self) -> bool { - self.words.is_empty() + self.words.is_empty() || self.words.iter().all(Option::is_none) } // precondition: token has kind Word or StopWord diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index d16162b1b..131dcd856 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -203,20 +203,15 @@ pub fn compute_phrase_docids( if words.is_empty() { return Ok(RoaringBitmap::new()); } - if words.len() == 1 { - if let Some(word) = &words[0] { - if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? { - return Ok(word_docids); - } else { - return Ok(RoaringBitmap::new()); - } + let mut candidates = RoaringBitmap::new(); + for word in words.iter().flatten().copied() { + if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { + candidates |= word_docids; } else { return Ok(RoaringBitmap::new()); } } - let mut candidates = RoaringBitmap::new(); - let mut first_iter = true; let winsize = words.len().min(3); for win in words.windows(winsize) { @@ -262,12 +257,8 @@ pub fn compute_phrase_docids( bitmaps.sort_unstable_by_key(|a| a.len()); for bitmap in bitmaps { - if first_iter { - candidates = bitmap; - first_iter = false; - } else { - candidates &= bitmap; - } + candidates &= bitmap; + // There will be no match, return early if candidates.is_empty() { break; diff --git a/milli/src/search/new/tests/mod.rs b/milli/src/search/new/tests/mod.rs index 1194d32ac..cdcdb5936 100644 --- a/milli/src/search/new/tests/mod.rs +++ b/milli/src/search/new/tests/mod.rs @@ -11,6 +11,7 @@ pub mod sort; pub mod typo; pub mod typo_proximity; pub mod words_tms; +pub mod stop_words; fn collect_field_values( index: &crate::Index, diff --git a/milli/src/search/new/tests/stop_words.rs b/milli/src/search/new/tests/stop_words.rs new file mode 100644 index 000000000..96dd06584 --- /dev/null +++ b/milli/src/search/new/tests/stop_words.rs @@ -0,0 +1,135 @@ +/*! +This module tests the following properties about stop words: +- they are not indexed +- they are not searchable +- they are case sensitive +- they are ignored in phrases +- If a query consists only of stop words, a placeholder query is used instead +- A prefix word is never ignored, even if the prefix is a stop word +- Phrases consisting only of stop words are ignored +*/ + +use std::{collections::BTreeSet, iter::FromIterator}; + +use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy}; + +fn create_index() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["title".to_owned()]); + s.set_stop_words(BTreeSet::from_iter([ + "to".to_owned(), + "The".to_owned(), + "xyz".to_owned(), + ])); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "title": "Shazam!", + }, + { + "id": 1, + "title": "Captain Marvel", + }, + { + "id": 2, + "title": "Escape Room", + }, + { + "id": 3, + "title": "How to Train Your Dragon: The Hidden World", + }, + { + "id": 4, + "title": "Gläss", + }, + { + "id": 5, + "title": "How to Attempt to Train Your Dragon", + }, + { + "id": 6, + "title": "How to Train Your Dragon: the Hidden World", + }, + ])) + .unwrap(); + index +} + +#[test] +fn test_stop_words_not_indexed() { + let index = create_index(); + db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264"); +} + +#[test] +fn test_ignore_stop_words() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + // `the` is treated as a prefix here, so it's not ignored + let mut s = Search::new(&txn, &index); + s.query("xyz to the"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // `xyz` is treated as a prefix here, so it's not ignored + let mut s = Search::new(&txn, &index); + s.query("to the xyz"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + // `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored + let mut s = Search::new(&txn, &index); + s.query("to the xyz "); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.query("to the dragon xyz"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); +} + +#[test] +fn test_stop_words_in_phrase() { + let index = create_index(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.query("\"how to train your dragon\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]"); + + let mut s = Search::new(&txn, &index); + s.query("how \"to\" train \"the"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]"); + + let mut s = Search::new(&txn, &index); + s.query("how \"to\" train \"The dragon"); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]"); + + let mut s = Search::new(&txn, &index); + s.query("\"to\""); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]"); +}