Add tests for stop words and fix a couple of bugs

2024-11-22 21:04:27 +01:00 · 2023-04-27 13:30:09 +02:00 · 2023-04-27 13:30:09 +02:00 · 374095d42c
commit 374095d42c
parent dd007dceca
5 changed files with 155 additions and 17 deletions
--- a/milli/src/search/new/mod.rs
+++ b/milli/src/search/new/mod.rs
@ -333,7 +333,8 @@ pub fn execute_search(
    check_sort_criteria(ctx, sort_criteria.as_ref())?;

    let mut located_query_terms = None;
-    let bucket_sort_output = if let Some(query) = query {
+
+    let query_terms = if let Some(query) = query {
        // We make sure that the analyzer is aware of the stop words
        // this ensures that the query builder is able to properly remove them.
        let mut tokbuilder = TokenizerBuilder::new();
@ -351,6 +352,16 @@ pub fn execute_search(
        let tokens = tokenizer.tokenize(query);

        let query_terms = located_query_terms_from_string(ctx, tokens, words_limit)?;
+        if query_terms.is_empty() {
+            // Do a placeholder search instead
+            None
+        } else {
+            Some(query_terms)
+        }
+    } else {
+        None
+    };
+    let bucket_sort_output = if let Some(query_terms) = query_terms {
        let graph = QueryGraph::from_query(ctx, &query_terms)?;
        located_query_terms = Some(query_terms);

--- a/milli/src/search/new/query_term/parse_query.rs
+++ b/milli/src/search/new/query_term/parse_query.rs
@ -241,7 +241,7 @@ impl PhraseBuilder {
    }

    fn is_empty(&self) -> bool {
-        self.words.is_empty()
+        self.words.is_empty() || self.words.iter().all(Option::is_none)
    }

    // precondition: token has kind Word or StopWord
--- a/milli/src/search/new/resolve_query_graph.rs
+++ b/milli/src/search/new/resolve_query_graph.rs
@ -203,20 +203,15 @@ pub fn compute_phrase_docids(
    if words.is_empty() {
        return Ok(RoaringBitmap::new());
    }
-    if words.len() == 1 {
-        if let Some(word) = &words[0] {
-            if let Some(word_docids) = ctx.word_docids(Word::Original(*word))? {
-                return Ok(word_docids);
-            } else {
-                return Ok(RoaringBitmap::new());
-            }
+    let mut candidates = RoaringBitmap::new();
+    for word in words.iter().flatten().copied() {
+        if let Some(word_docids) = ctx.word_docids(Word::Original(word))? {
+            candidates |= word_docids;
        } else {
            return Ok(RoaringBitmap::new());
        }
    }

-    let mut candidates = RoaringBitmap::new();
-    let mut first_iter = true;
    let winsize = words.len().min(3);

    for win in words.windows(winsize) {
@ -262,12 +257,8 @@ pub fn compute_phrase_docids(
        bitmaps.sort_unstable_by_key(|a| a.len());

        for bitmap in bitmaps {
-            if first_iter {
-                candidates = bitmap;
-                first_iter = false;
-            } else {
            candidates &= bitmap;
-            }
+
            // There will be no match, return early
            if candidates.is_empty() {
                break;
--- a/milli/src/search/new/tests/mod.rs
+++ b/milli/src/search/new/tests/mod.rs
@ -11,6 +11,7 @@ pub mod sort;
 pub mod typo;
 pub mod typo_proximity;
 pub mod words_tms;
+pub mod stop_words;

 fn collect_field_values(
    index: &crate::Index,
--- a/milli/src/search/new/tests/stop_words.rs
+++ b/milli/src/search/new/tests/stop_words.rs
@ -0,0 +1,135 @@
+/*!
+This module tests the following properties about stop words:
+- they are not indexed
+- they are not searchable
+- they are case sensitive
+- they are ignored in phrases
+- If a query consists only of stop words, a placeholder query is used instead
+- A prefix word is never ignored, even if the prefix is a stop word
+- Phrases consisting only of stop words are ignored
+*/
+
+use std::{collections::BTreeSet, iter::FromIterator};
+
+use crate::{db_snap, index::tests::TempIndex, Search, SearchResult, TermsMatchingStrategy};
+
+fn create_index() -> TempIndex {
+    let index = TempIndex::new();
+
+    index
+        .update_settings(|s| {
+            s.set_primary_key("id".to_owned());
+            s.set_searchable_fields(vec!["title".to_owned()]);
+            s.set_stop_words(BTreeSet::from_iter([
+                "to".to_owned(),
+                "The".to_owned(),
+                "xyz".to_owned(),
+            ]));
+        })
+        .unwrap();
+
+    index
+        .add_documents(documents!([
+        {
+            "id": 0,
+            "title": "Shazam!",
+        },
+        {
+            "id": 1,
+            "title": "Captain Marvel",
+        },
+        {
+            "id": 2,
+            "title": "Escape Room",
+        },
+        {
+            "id": 3,
+            "title": "How to Train Your Dragon: The Hidden World",
+        },
+        {
+            "id": 4,
+            "title": "Gläss",
+        },
+        {
+            "id": 5,
+            "title": "How to Attempt to Train Your Dragon",
+        },
+        {
+            "id": 6,
+            "title": "How to Train Your Dragon: the Hidden World",
+        },
+        ]))
+        .unwrap();
+    index
+}
+
+#[test]
+fn test_stop_words_not_indexed() {
+    let index = create_index();
+    db_snap!(index, word_docids, @"6288f9d7db3703b02c57025eb4a69264");
+}
+
+#[test]
+fn test_ignore_stop_words() {
+    let index = create_index();
+
+    let txn = index.read_txn().unwrap();
+
+    // `the` is treated as a prefix here, so it's not ignored
+    let mut s = Search::new(&txn, &index);
+    s.query("xyz to the");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    // `xyz` is treated as a prefix here, so it's not ignored
+    let mut s = Search::new(&txn, &index);
+    s.query("to the xyz");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    // `xyz` is not treated as a prefix anymore because of the trailing space, so it's ignored
+    let mut s = Search::new(&txn, &index);
+    s.query("to the xyz ");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("to the dragon xyz");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+}
+
+#[test]
+fn test_stop_words_in_phrase() {
+    let index = create_index();
+
+    let txn = index.read_txn().unwrap();
+
+    let mut s = Search::new(&txn, &index);
+    s.query("\"how to train your dragon\"");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[3, 6]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("how \"to\" train \"the");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("how \"to\" train \"The dragon");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[]");
+
+    let mut s = Search::new(&txn, &index);
+    s.query("\"to\"");
+    s.terms_matching_strategy(TermsMatchingStrategy::Last);
+    let SearchResult { documents_ids, .. } = s.execute().unwrap();
+    insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5, 6]");
+}