Fix bugs and add tests to exactness ranking rule

Loïc Lecrenier 2023-04-25 16:49:08 +02:00
parent 8f2e971879
commit d3a94e8b25
7 changed files with 410 additions and 31 deletions

View File

@ -91,6 +91,12 @@ impl State {
universe: &RoaringBitmap,
query_graph: &QueryGraph,
) -> Result<Self> {
// An ordered list of the (remaining) query terms, with data extracted from them:
// 0. exact subterm. If it doesn't exist, the term is skipped.
// 1. start position of the term
// 2. id of the term
let mut count_all_positions = 0;
let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> =
Vec::with_capacity(query_graph.nodes.len() as usize);
for (_, node) in query_graph.nodes.iter() {
@ -101,6 +107,7 @@ impl State {
} else {
continue;
};
count_all_positions += term.positions.len();
exact_term_position_ids.push((
exact_term,
*term.positions.start(),
@ -195,13 +202,15 @@ impl State {
if !intersection.is_empty() {
// TODO: although not really worth it in terms of performance,
// it would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = ctx
.index
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index
.field_id_word_count_docids
.get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))?
.unwrap_or_default();
// TODO: consider if we must store the candidates as arrays, or if there is a way to perform the union
// here.
.get(ctx.txn, &(fid, count_all_positions as u8))?
.unwrap_or_default()
} else {
RoaringBitmap::default()
};
candidates_per_attribute.push(FieldCandidates {
start_with_exact: intersection,
exact_word_count: candidates_with_exact_word_count,
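
The hunk above now keys `field_id_word_count_docids` by the total number of word positions (`count_all_positions`) rather than by the number of exact terms, and it skips the lookup when that count cannot fit in the `u8` used by the database key. A minimal sketch of that guard, assuming a hypothetical `lookup` closure in place of the real heed-backed `ctx.index` access:

use roaring::RoaringBitmap;

// Sketch only: `lookup` stands in for the `field_id_word_count_docids` database.
fn exact_word_count_candidates(
    lookup: impl Fn(u16, u8) -> Option<RoaringBitmap>,
    fid: u16,
    count_all_positions: usize,
) -> RoaringBitmap {
    // The word count is stored as a `u8` in the key, so a query whose total
    // position count overflows a `u8` can never match an attribute exactly.
    if count_all_positions < u8::MAX as usize {
        lookup(fid, count_all_positions as u8).unwrap_or_default()
    } else {
        RoaringBitmap::default()
    }
}

This is also why the earlier hunk accumulates `term.positions.len()` into `count_all_positions` over the remaining terms.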

View File

@ -310,11 +310,11 @@ impl QueryGraph {
rank as u16
};
let mut nodes_to_remove = BTreeMap::<u16, SmallBitmap<QueryNode>>::new();
let mut at_least_one_phrase = false;
let mut at_least_one_mandatory_term = false;
for (node_id, node) in self.nodes.iter() {
let QueryNodeData::Term(t) = &node.data else { continue };
if t.term_subset.original_phrase(ctx).is_some() {
at_least_one_phrase = true;
if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() {
at_least_one_mandatory_term = true;
continue;
}
let mut cost = 0;
@ -327,7 +327,7 @@ impl QueryGraph {
.insert(node_id);
}
let mut res: Vec<_> = nodes_to_remove.into_values().collect();
if !at_least_one_phrase {
if !at_least_one_mandatory_term {
res.pop();
}
res
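
For context on the change above: the loop collects, per cost, the set of term nodes that the term matching strategy may remove, and it only drops the last (most expensive) bucket when no phrase or mandatory term already guarantees that the query keeps at least one term. A simplified, runnable sketch of that bucketing, where each term is reduced to a `(cost, is_mandatory_or_phrase)` pair instead of a `QueryNode`:

use std::collections::BTreeMap;

// Sketch: each entry is (cost, is_mandatory_or_phrase) for one term node.
fn removal_buckets(terms: &[(u16, bool)]) -> Vec<Vec<usize>> {
    let mut nodes_to_remove = BTreeMap::<u16, Vec<usize>>::new();
    let mut at_least_one_mandatory_term = false;
    for (node_id, &(cost, mandatory)) in terms.iter().enumerate() {
        if mandatory {
            // Phrases and terms made mandatory by the exactness rule are never
            // scheduled for removal.
            at_least_one_mandatory_term = true;
            continue;
        }
        nodes_to_remove.entry(cost).or_default().push(node_id);
    }
    let mut res: Vec<_> = nodes_to_remove.into_values().collect();
    if !at_least_one_mandatory_term {
        // Keep the highest-cost bucket so that at least one term always remains.
        res.pop();
    }
    res
}

In the actual code the buckets are `SmallBitmap<QueryNode>` values keyed by cost in a `BTreeMap`, but the control flow is the same.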

View File

@ -4,6 +4,7 @@ mod parse_query;
mod phrase;
use std::collections::BTreeSet;
use std::iter::FromIterator;
use std::ops::RangeInclusive;
use compute_derivations::partially_initialized_term_from_word;
@ -30,6 +31,12 @@ pub struct QueryTermSubset {
zero_typo_subset: NTypoTermSubset,
one_typo_subset: NTypoTermSubset,
two_typo_subset: NTypoTermSubset,
/// `true` if the term cannot be deleted through the term matching strategy
///
/// Note that there are other reasons for which a term cannot be deleted, such as
/// being a phrase. In that case, this field could be set to `false`, but it
/// still wouldn't be deletable by the term matching strategy.
mandatory: bool,
}
#[derive(Clone, PartialEq, Eq, Hash)]
@ -114,6 +121,12 @@ impl ExactTerm {
}
impl QueryTermSubset {
pub fn is_mandatory(&self) -> bool {
self.mandatory
}
pub fn make_mandatory(&mut self) {
self.mandatory = true;
}
pub fn exact_term(&self, ctx: &SearchContext) -> Option<ExactTerm> {
let full_query_term = ctx.term_interner.get(self.original);
if full_query_term.ngram_words.is_some() {
@ -135,6 +148,7 @@ impl QueryTermSubset {
zero_typo_subset: NTypoTermSubset::Nothing,
one_typo_subset: NTypoTermSubset::Nothing,
two_typo_subset: NTypoTermSubset::Nothing,
mandatory: false,
}
}
pub fn full(for_term: Interned<QueryTerm>) -> Self {
@ -143,6 +157,7 @@ impl QueryTermSubset {
zero_typo_subset: NTypoTermSubset::All,
one_typo_subset: NTypoTermSubset::All,
two_typo_subset: NTypoTermSubset::All,
mandatory: false,
}
}
@ -352,6 +367,28 @@ impl QueryTermSubset {
_ => panic!(),
}
}
pub fn keep_only_exact_term(&mut self, ctx: &SearchContext) {
if let Some(term) = self.exact_term(ctx) {
match term {
ExactTerm::Phrase(p) => {
self.zero_typo_subset = NTypoTermSubset::Subset {
words: BTreeSet::new(),
phrases: BTreeSet::from_iter([p]),
};
self.clear_one_typo_subset();
self.clear_two_typo_subset();
}
ExactTerm::Word(w) => {
self.zero_typo_subset = NTypoTermSubset::Subset {
words: BTreeSet::from_iter([w]),
phrases: BTreeSet::new(),
};
self.clear_one_typo_subset();
self.clear_two_typo_subset();
}
}
}
}
pub fn clear_zero_typo_subset(&mut self) {
self.zero_typo_subset = NTypoTermSubset::Nothing;
}
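
`keep_only_exact_term` narrows the zero-typo bucket down to the single exact word or phrase and empties the one- and two-typo buckets, since typo derivations can never be exact matches. A self-contained sketch of that narrowing, using simplified stand-ins for `NTypoTermSubset` (plain strings instead of interned words and phrases, an assumption made only for this example):

use std::collections::BTreeSet;

// Simplified stand-in for `NTypoTermSubset`, for illustration only.
#[derive(Debug, PartialEq)]
enum TypoSubset {
    All,
    Subset { words: BTreeSet<String>, phrases: BTreeSet<String> },
    Nothing,
}

struct TermSubsetSketch {
    zero_typo: TypoSubset,
    one_typo: TypoSubset,
    two_typo: TypoSubset,
}

impl TermSubsetSketch {
    // Keep only the given exact word: it becomes the sole zero-typo derivation,
    // and the one- and two-typo buckets are cleared entirely.
    fn keep_only_exact_word(&mut self, exact: &str) {
        self.zero_typo = TypoSubset::Subset {
            words: BTreeSet::from([exact.to_owned()]),
            phrases: BTreeSet::new(),
        };
        self.one_typo = TypoSubset::Nothing;
        self.two_typo = TypoSubset::Nothing;
    }
}

The real method additionally handles the phrase case by putting the interned phrase into `phrases` instead of `words`, as shown in the diff above.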

View File

@ -34,7 +34,7 @@ fn compute_docids(
}
}
};
// TODO: synonyms?
candidates &= universe;
Ok(candidates)
}
@ -47,18 +47,21 @@ impl RankingRuleGraphTrait for ExactnessGraph {
condition: &Self::Condition,
universe: &RoaringBitmap,
) -> Result<ComputedCondition> {
let (docids, dest_node) = match condition {
let (docids, end_term_subset) = match condition {
ExactnessCondition::ExactInAttribute(dest_node) => {
(compute_docids(ctx, dest_node, universe)?, dest_node)
let mut end_term_subset = dest_node.clone();
end_term_subset.term_subset.keep_only_exact_term(ctx);
end_term_subset.term_subset.make_mandatory();
(compute_docids(ctx, dest_node, universe)?, end_term_subset)
}
ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node),
ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node.clone()),
};
Ok(ComputedCondition {
docids,
universe_len: universe.len(),
start_term_subset: None,
// TODO/FIXME: modify `end_term_subset` to signal to the next ranking rules that the term cannot be removed
end_term_subset: dest_node.clone(),
end_term_subset,
})
}

View File

@ -165,8 +165,8 @@ pub fn compute_query_graph_docids(
positions: _,
term_ids: _,
}) => {
let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?;
predecessors_docids & phrase_docids
let node_docids = compute_query_term_subset_docids(ctx, term_subset)?;
predecessors_docids & node_docids
}
QueryNodeData::Deleted => {
panic!()

View File

@ -14,6 +14,11 @@ This module tests the following properties about the exactness ranking rule:
1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap"
2. those that have an attribute which starts with the whole remaining query, if this query does not have any "gap"
3. those that contain the most exact words from the remaining query
- if it is followed by other ranking rules, then:
1. `words` will not remove the exact terms matched by `exactness`
2. graph-based ranking rules (`typo`, `proximity`, `attribute`) will only work with
(1) the exact terms selected by `exactness` or (2) the full query term otherwise
*/
use crate::{
@ -21,7 +26,7 @@ use crate::{
SearchResult, TermsMatchingStrategy,
};
fn create_index_exact_words_simple_ordered() -> TempIndex {
fn create_index_simple_ordered() -> TempIndex {
let index = TempIndex::new();
index
@ -80,7 +85,7 @@ fn create_index_exact_words_simple_ordered() -> TempIndex {
index
}
fn create_index_exact_words_simple_reversed() -> TempIndex {
fn create_index_simple_reversed() -> TempIndex {
let index = TempIndex::new();
index
@ -138,7 +143,7 @@ fn create_index_exact_words_simple_reversed() -> TempIndex {
index
}
fn create_index_exact_words_simple_random() -> TempIndex {
fn create_index_simple_random() -> TempIndex {
let index = TempIndex::new();
index
@ -242,9 +247,192 @@ fn create_index_attribute_starts_with() -> TempIndex {
index
}
fn create_index_simple_ordered_with_typos() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_criteria(vec![Criterion::Exactness]);
})
.unwrap();
index
.add_documents(documents!([
{
"id": 0,
"text": "",
},
{
"id": 1,
"text": "the",
},
{
"id": 2,
"text": "the quack",
},
{
"id": 3,
"text": "the quack briwn",
},
{
"id": 4,
"text": "the quack briwn fox",
},
{
"id": 5,
"text": "the quack briwn fox jlmps",
},
{
"id": 6,
"text": "the quack briwn fox jlmps over",
},
{
"id": 7,
"text": "the quack briwn fox jlmps over the",
},
{
"id": 8,
"text": "the quack briwn fox jlmps over the lazy",
},
{
"id": 9,
"text": "the quack briwn fox jlmps over the lazy dog",
},
{
"id": 10,
"text": "",
},
{
"id": 11,
"text": "the",
},
{
"id": 12,
"text": "the quick",
},
{
"id": 13,
"text": "the quick brown",
},
{
"id": 14,
"text": "the quick brown fox",
},
{
"id": 15,
"text": "the quick brown fox jumps",
},
{
"id": 16,
"text": "the quick brown fox jumps over",
},
{
"id": 17,
"text": "the quick brown fox jumps over the",
},
{
"id": 18,
"text": "the quick brown fox jumps over the lazy",
},
{
"id": 19,
"text": "the quick brown fox jumps over the lazy dog",
},
]))
.unwrap();
index
}
fn create_index_with_varying_proximities() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
})
.unwrap();
index
.add_documents(documents!([
{
"id": 0,
"text": "lazy jumps dog brown quick the over fox the",
},
{
"id": 1,
"text": "the quick brown fox jumps over the very lazy dog"
},
{
"id": 2,
"text": "the quick brown fox jumps over the lazy dog",
},
{
"id": 3,
"text": "dog brown quick the over fox the lazy",
},
{
"id": 4,
"text": "the quick brown fox over the very lazy dog"
},
{
"id": 5,
"text": "the quick brown fox over the lazy dog",
},
{
"id": 6,
"text": "brown quick the over fox",
},
{
"id": 7,
"text": "the very quick brown fox over"
},
{
"id": 8,
"text": "the quick brown fox over",
},
]))
.unwrap();
index
}
fn create_index_all_equal_except_proximity_between_ignored_terms() -> TempIndex {
let index = TempIndex::new();
index
.update_settings(|s| {
s.set_primary_key("id".to_owned());
s.set_searchable_fields(vec!["text".to_owned()]);
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
})
.unwrap();
index
.add_documents(documents!([
{
"id": 0,
"text": "lazy jumps dog brown quick the over fox the"
},
{
"id": 1,
"text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps",
},
{
"id": 2,
"text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps overt",
},
]))
.unwrap();
index
}
#[test]
fn test_exactness_simple_ordered() {
let index = create_index_exact_words_simple_ordered();
let index = create_index_simple_ordered();
let txn = index.read_txn().unwrap();
@ -271,7 +459,7 @@ fn test_exactness_simple_ordered() {
#[test]
fn test_exactness_simple_reversed() {
let index = create_index_exact_words_simple_reversed();
let index = create_index_simple_reversed();
let txn = index.read_txn().unwrap();
@ -318,7 +506,7 @@ fn test_exactness_simple_reversed() {
#[test]
fn test_exactness_simple_random() {
let index = create_index_exact_words_simple_random();
let index = create_index_simple_random();
let txn = index.read_txn().unwrap();
@ -377,13 +565,12 @@ fn test_exactness_attribute_starts_with_phrase() {
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("\"overlooking the sea\" is a beautiful balcony");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]");
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: this is incorrect, the first document returned here should actually be the second one
insta::assert_debug_snapshot!(texts, @r###"
[
"\"overlooking the sea is a beautiful balcony, I love it\"",
"\"overlooking the sea is a beautiful balcony\"",
"\"overlooking the sea is a beautiful balcony, I love it\"",
"\"a beautiful balcony is overlooking the sea\"",
"\"over looking the sea is a beautiful balcony\"",
"\"this balcony is overlooking the sea\"",
@ -398,7 +585,6 @@ fn test_exactness_attribute_starts_with_phrase() {
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
// TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases
insta::assert_debug_snapshot!(texts, @r###"
[
"\"overlooking the sea is a beautiful balcony\"",
@ -440,3 +626,148 @@ fn test_exactness_all_candidates_with_typo() {
]
"###);
}
#[test]
fn test_exactness_after_words() {
let index = create_index_simple_ordered_with_typos();
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Words, Criterion::Exactness]);
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quack briwn fox jlmps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quack briwn fox jlmps over the lazy\"",
"\"the quick brown fox jumps over the\"",
"\"the quick brown fox jumps over\"",
"\"the quack briwn fox jlmps over\"",
"\"the quack briwn fox jlmps over the\"",
"\"the quick brown fox jumps\"",
"\"the quack briwn fox jlmps\"",
"\"the quick brown fox\"",
"\"the quack briwn fox\"",
"\"the quick brown\"",
"\"the quack briwn\"",
"\"the quick\"",
"\"the quack\"",
"\"the\"",
"\"the\"",
]
"###);
}
#[test]
fn test_words_after_exactness() {
let index = create_index_simple_ordered_with_typos();
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Exactness, Criterion::Words]);
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 16, 17, 9, 15, 8, 14, 6, 7, 13, 5, 4, 12, 3, 2, 1, 11]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the lazy\"",
"\"the quick brown fox jumps over\"",
"\"the quick brown fox jumps over the\"",
"\"the quack briwn fox jlmps over the lazy dog\"",
"\"the quick brown fox jumps\"",
"\"the quack briwn fox jlmps over the lazy\"",
"\"the quick brown fox\"",
"\"the quack briwn fox jlmps over\"",
"\"the quack briwn fox jlmps over the\"",
"\"the quick brown\"",
"\"the quack briwn fox jlmps\"",
"\"the quack briwn fox\"",
"\"the quick\"",
"\"the quack briwn\"",
"\"the quack\"",
"\"the\"",
"\"the\"",
]
"###);
}
#[test]
fn test_proximity_after_exactness() {
let index = create_index_with_varying_proximities();
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 5, 4, 3, 8, 6, 7]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"the quick brown fox jumps over the lazy dog\"",
"\"the quick brown fox jumps over the very lazy dog\"",
"\"lazy jumps dog brown quick the over fox the\"",
"\"the quick brown fox over the lazy dog\"",
"\"the quick brown fox over the very lazy dog\"",
"\"dog brown quick the over fox the lazy\"",
"\"the quick brown fox over\"",
"\"brown quick the over fox\"",
"\"the very quick brown fox over\"",
]
"###);
let index = create_index_all_equal_except_proximity_between_ignored_terms();
index
.update_settings(|s| {
s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]);
})
.unwrap();
let txn = index.read_txn().unwrap();
let mut s = Search::new(&txn, &index);
s.terms_matching_strategy(TermsMatchingStrategy::Last);
s.query("the quick brown fox jumps over the lazy dog");
let SearchResult { documents_ids, .. } = s.execute().unwrap();
insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]");
let texts = collect_field_values(&index, &txn, "text", &documents_ids);
insta::assert_debug_snapshot!(texts, @r###"
[
"\"lazy jumps dog brown quick the over fox the\"",
"\"lazy jumps dog brown quick the over fox the. quack briwn jlmps\"",
"\"lazy jumps dog brown quick the over fox the. quack briwn jlmps overt\"",
]
"###);
}

View File

@ -33,7 +33,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
&mut self,
ctx: &mut SearchContext<'ctx>,
_logger: &mut dyn SearchLogger<QueryGraph>,
_parent_candidates: &RoaringBitmap,
_universe: &RoaringBitmap,
parent_query_graph: &QueryGraph,
) -> Result<()> {
self.exhausted = false;
@ -77,7 +77,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words {
let nodes_to_remove = self.nodes_to_remove.pop().unwrap();
query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::<Vec<_>>());
}
Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket }))
}