From d3a94e8b25b7d489b5591154bdd1ed6e5530a945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Tue, 25 Apr 2023 16:49:08 +0200 Subject: [PATCH] Fix bugs and add tests to exactness ranking rule --- milli/src/search/new/exact_attribute.rs | 23 +- milli/src/search/new/query_graph.rs | 8 +- milli/src/search/new/query_term/mod.rs | 37 ++ .../new/ranking_rule_graph/exactness/mod.rs | 15 +- milli/src/search/new/resolve_query_graph.rs | 4 +- milli/src/search/new/tests/exactness.rs | 351 +++++++++++++++++- milli/src/search/new/words.rs | 3 +- 7 files changed, 410 insertions(+), 31 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index bc0195ebc..0b95243bc 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -91,6 +91,12 @@ impl State { universe: &RoaringBitmap, query_graph: &QueryGraph, ) -> Result { + // An ordered list of the (remaining) query terms, with data extracted from them: + // 0. exact subterm. If it doesn't exist, the term is skipped. + // 1. start position of the term + // 2. id of the term + let mut count_all_positions = 0; + let mut exact_term_position_ids: Vec<(ExactTerm, u16, u8)> = Vec::with_capacity(query_graph.nodes.len() as usize); for (_, node) in query_graph.nodes.iter() { @@ -101,6 +107,7 @@ impl State { } else { continue; }; + count_all_positions += term.positions.len(); exact_term_position_ids.push(( exact_term, *term.positions.start(), @@ -195,13 +202,15 @@ impl State { if !intersection.is_empty() { // TODO: although not really worth it in terms of performance, // if would be good to put this in cache for the sake of consistency - let candidates_with_exact_word_count = ctx - .index - .field_id_word_count_docids - .get(ctx.txn, &(fid, exact_term_position_ids.len() as u8))? - .unwrap_or_default(); - // TODO: consider if we must store the candidates as arrays, or if there is a way to perform the union - // here. + let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { + ctx.index + .field_id_word_count_docids + .get(ctx.txn, &(fid, count_all_positions as u8))? + .unwrap_or_default() + } else { + RoaringBitmap::default() + }; + candidates_per_attribute.push(FieldCandidates { start_with_exact: intersection, exact_word_count: candidates_with_exact_word_count, diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 2662ef730..155e6ad75 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -310,11 +310,11 @@ impl QueryGraph { rank as u16 }; let mut nodes_to_remove = BTreeMap::>::new(); - let mut at_least_one_phrase = false; + let mut at_least_one_mandatory_term = false; for (node_id, node) in self.nodes.iter() { let QueryNodeData::Term(t) = &node.data else { continue }; - if t.term_subset.original_phrase(ctx).is_some() { - at_least_one_phrase = true; + if t.term_subset.original_phrase(ctx).is_some() || t.term_subset.is_mandatory() { + at_least_one_mandatory_term = true; continue; } let mut cost = 0; @@ -327,7 +327,7 @@ impl QueryGraph { .insert(node_id); } let mut res: Vec<_> = nodes_to_remove.into_values().collect(); - if !at_least_one_phrase { + if !at_least_one_mandatory_term { res.pop(); } res diff --git a/milli/src/search/new/query_term/mod.rs b/milli/src/search/new/query_term/mod.rs index 5ee29615b..5f1a45d83 100644 --- a/milli/src/search/new/query_term/mod.rs +++ b/milli/src/search/new/query_term/mod.rs @@ -4,6 +4,7 @@ mod parse_query; mod phrase; use std::collections::BTreeSet; +use std::iter::FromIterator; use std::ops::RangeInclusive; use compute_derivations::partially_initialized_term_from_word; @@ -30,6 +31,12 @@ pub struct QueryTermSubset { zero_typo_subset: NTypoTermSubset, one_typo_subset: NTypoTermSubset, two_typo_subset: NTypoTermSubset, + /// `true` if the term cannot be deleted through the term matching strategy + /// + /// Note that there are other reasons for which a term cannot be deleted, such as + /// being a phrase. In that case, this field could be set to `false`, but it + /// still wouldn't be deleteable by the term matching strategy. + mandatory: bool, } #[derive(Clone, PartialEq, Eq, Hash)] @@ -114,6 +121,12 @@ impl ExactTerm { } impl QueryTermSubset { + pub fn is_mandatory(&self) -> bool { + self.mandatory + } + pub fn make_mandatory(&mut self) { + self.mandatory = true; + } pub fn exact_term(&self, ctx: &SearchContext) -> Option { let full_query_term = ctx.term_interner.get(self.original); if full_query_term.ngram_words.is_some() { @@ -135,6 +148,7 @@ impl QueryTermSubset { zero_typo_subset: NTypoTermSubset::Nothing, one_typo_subset: NTypoTermSubset::Nothing, two_typo_subset: NTypoTermSubset::Nothing, + mandatory: false, } } pub fn full(for_term: Interned) -> Self { @@ -143,6 +157,7 @@ impl QueryTermSubset { zero_typo_subset: NTypoTermSubset::All, one_typo_subset: NTypoTermSubset::All, two_typo_subset: NTypoTermSubset::All, + mandatory: false, } } @@ -352,6 +367,28 @@ impl QueryTermSubset { _ => panic!(), } } + pub fn keep_only_exact_term(&mut self, ctx: &SearchContext) { + if let Some(term) = self.exact_term(ctx) { + match term { + ExactTerm::Phrase(p) => { + self.zero_typo_subset = NTypoTermSubset::Subset { + words: BTreeSet::new(), + phrases: BTreeSet::from_iter([p]), + }; + self.clear_one_typo_subset(); + self.clear_two_typo_subset(); + } + ExactTerm::Word(w) => { + self.zero_typo_subset = NTypoTermSubset::Subset { + words: BTreeSet::from_iter([w]), + phrases: BTreeSet::new(), + }; + self.clear_one_typo_subset(); + self.clear_two_typo_subset(); + } + } + } + } pub fn clear_zero_typo_subset(&mut self) { self.zero_typo_subset = NTypoTermSubset::Nothing; } diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 7455a7a17..431eeac30 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -34,7 +34,7 @@ fn compute_docids( } } }; - // TODO: synonyms? + candidates &= universe; Ok(candidates) } @@ -47,18 +47,21 @@ impl RankingRuleGraphTrait for ExactnessGraph { condition: &Self::Condition, universe: &RoaringBitmap, ) -> Result { - let (docids, dest_node) = match condition { + let (docids, end_term_subset) = match condition { ExactnessCondition::ExactInAttribute(dest_node) => { - (compute_docids(ctx, dest_node, universe)?, dest_node) + let mut end_term_subset = dest_node.clone(); + end_term_subset.term_subset.keep_only_exact_term(ctx); + end_term_subset.term_subset.make_mandatory(); + (compute_docids(ctx, dest_node, universe)?, end_term_subset) } - ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node), + ExactnessCondition::Skip(dest_node) => (universe.clone(), dest_node.clone()), }; + Ok(ComputedCondition { docids, universe_len: universe.len(), start_term_subset: None, - // TODO/FIXME: modify `end_term_subset` to signal to the next ranking rules that the term cannot be removed - end_term_subset: dest_node.clone(), + end_term_subset, }) } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 34ed135d4..d16162b1b 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -165,8 +165,8 @@ pub fn compute_query_graph_docids( positions: _, term_ids: _, }) => { - let phrase_docids = compute_query_term_subset_docids(ctx, term_subset)?; - predecessors_docids & phrase_docids + let node_docids = compute_query_term_subset_docids(ctx, term_subset)?; + predecessors_docids & node_docids } QueryNodeData::Deleted => { panic!() diff --git a/milli/src/search/new/tests/exactness.rs b/milli/src/search/new/tests/exactness.rs index f1f4fbe40..2077f6c01 100644 --- a/milli/src/search/new/tests/exactness.rs +++ b/milli/src/search/new/tests/exactness.rs @@ -14,6 +14,11 @@ This module tests the following properties about the exactness ranking rule: 1. those that have an attribute which is equal to the whole remaining query, if this query does not have any "gap" 2. those that have an attribute which start with the whole remaining query, if this query does not have any "gap" 3. those that contain the most exact words from the remaining query + +- if it is followed by other ranking rules, then: + 1. `word` will not remove the exact terms matched by `exactness` + 2. graph-based ranking rules (`typo`, `proximity`, `attribute`) will only work with + (1) the exact terms selected by `exactness` or (2) the full query term otherwise */ use crate::{ @@ -21,7 +26,7 @@ use crate::{ SearchResult, TermsMatchingStrategy, }; -fn create_index_exact_words_simple_ordered() -> TempIndex { +fn create_index_simple_ordered() -> TempIndex { let index = TempIndex::new(); index @@ -80,7 +85,7 @@ fn create_index_exact_words_simple_ordered() -> TempIndex { index } -fn create_index_exact_words_simple_reversed() -> TempIndex { +fn create_index_simple_reversed() -> TempIndex { let index = TempIndex::new(); index @@ -138,7 +143,7 @@ fn create_index_exact_words_simple_reversed() -> TempIndex { index } -fn create_index_exact_words_simple_random() -> TempIndex { +fn create_index_simple_random() -> TempIndex { let index = TempIndex::new(); index @@ -242,9 +247,192 @@ fn create_index_attribute_starts_with() -> TempIndex { index } +fn create_index_simple_ordered_with_typos() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "", + }, + { + "id": 1, + "text": "the", + }, + { + "id": 2, + "text": "the quack", + }, + { + "id": 3, + "text": "the quack briwn", + }, + { + "id": 4, + "text": "the quack briwn fox", + }, + { + "id": 5, + "text": "the quack briwn fox jlmps", + }, + { + "id": 6, + "text": "the quack briwn fox jlmps over", + }, + { + "id": 7, + "text": "the quack briwn fox jlmps over the", + }, + { + "id": 8, + "text": "the quack briwn fox jlmps over the lazy", + }, + { + "id": 9, + "text": "the quack briwn fox jlmps over the lazy dog", + }, + { + "id": 10, + "text": "", + }, + { + "id": 11, + "text": "the", + }, + { + "id": 12, + "text": "the quick", + }, + { + "id": 13, + "text": "the quick brown", + }, + { + "id": 14, + "text": "the quick brown fox", + }, + { + "id": 15, + "text": "the quick brown fox jumps", + }, + + { + "id": 16, + "text": "the quick brown fox jumps over", + }, + { + "id": 17, + "text": "the quick brown fox jumps over the", + }, + { + "id": 18, + "text": "the quick brown fox jumps over the lazy", + }, + { + "id": 19, + "text": "the quick brown fox jumps over the lazy dog", + }, + ])) + .unwrap(); + index +} + +fn create_index_with_varying_proximities() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "lazy jumps dog brown quick the over fox the", + }, + { + "id": 1, + "text": "the quick brown fox jumps over the very lazy dog" + }, + { + "id": 2, + "text": "the quick brown fox jumps over the lazy dog", + }, + { + "id": 3, + "text": "dog brown quick the over fox the lazy", + }, + { + "id": 4, + "text": "the quick brown fox over the very lazy dog" + }, + { + "id": 5, + "text": "the quick brown fox over the lazy dog", + }, + { + "id": 6, + "text": "brown quick the over fox", + }, + { + "id": 7, + "text": "the very quick brown fox over" + }, + { + "id": 8, + "text": "the quick brown fox over", + }, + ])) + .unwrap(); + index +} + +fn create_index_all_equal_except_proximity_between_ignored_terms() -> TempIndex { + let index = TempIndex::new(); + + index + .update_settings(|s| { + s.set_primary_key("id".to_owned()); + s.set_searchable_fields(vec!["text".to_owned()]); + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + index + .add_documents(documents!([ + { + "id": 0, + "text": "lazy jumps dog brown quick the over fox the" + }, + { + "id": 1, + "text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps", + }, + { + "id": 2, + "text": "lazy jumps dog brown quick the over fox the. quack briwn jlmps overt", + }, + ])) + .unwrap(); + index +} + #[test] fn test_exactness_simple_ordered() { - let index = create_index_exact_words_simple_ordered(); + let index = create_index_simple_ordered(); let txn = index.read_txn().unwrap(); @@ -271,7 +459,7 @@ fn test_exactness_simple_ordered() { #[test] fn test_exactness_simple_reversed() { - let index = create_index_exact_words_simple_reversed(); + let index = create_index_simple_reversed(); let txn = index.read_txn().unwrap(); @@ -318,7 +506,7 @@ fn test_exactness_simple_reversed() { #[test] fn test_exactness_simple_random() { - let index = create_index_exact_words_simple_random(); + let index = create_index_simple_random(); let txn = index.read_txn().unwrap(); @@ -377,13 +565,12 @@ fn test_exactness_attribute_starts_with_phrase() { s.terms_matching_strategy(TermsMatchingStrategy::Last); s.query("\"overlooking the sea\" is a beautiful balcony"); let SearchResult { documents_ids, .. } = s.execute().unwrap(); - insta::assert_snapshot!(format!("{documents_ids:?}"), @"[5, 6, 4, 3, 1, 0, 2]"); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is incorrect, the first document returned here should actually be the second one insta::assert_debug_snapshot!(texts, @r###" [ - "\"overlooking the sea is a beautiful balcony, I love it\"", "\"overlooking the sea is a beautiful balcony\"", + "\"overlooking the sea is a beautiful balcony, I love it\"", "\"a beautiful balcony is overlooking the sea\"", "\"over looking the sea is a beautiful balcony\"", "\"this balcony is overlooking the sea\"", @@ -398,7 +585,6 @@ fn test_exactness_attribute_starts_with_phrase() { let SearchResult { documents_ids, .. } = s.execute().unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[6, 5, 4, 3, 1, 0, 2, 7]"); let texts = collect_field_values(&index, &txn, "text", &documents_ids); - // TODO: this is correct, so the exactness ranking rule probably has a bug in the handling of phrases insta::assert_debug_snapshot!(texts, @r###" [ "\"overlooking the sea is a beautiful balcony\"", @@ -440,3 +626,148 @@ fn test_exactness_all_candidates_with_typo() { ] "###); } + +#[test] +fn test_exactness_after_words() { + let index = create_index_simple_ordered_with_typos(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Words, Criterion::Exactness]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 9, 18, 8, 17, 16, 6, 7, 15, 5, 14, 4, 13, 3, 12, 2, 1, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quack briwn fox jlmps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quack briwn fox jlmps over the lazy\"", + "\"the quick brown fox jumps over the\"", + "\"the quick brown fox jumps over\"", + "\"the quack briwn fox jlmps over\"", + "\"the quack briwn fox jlmps over the\"", + "\"the quick brown fox jumps\"", + "\"the quack briwn fox jlmps\"", + "\"the quick brown fox\"", + "\"the quack briwn fox\"", + "\"the quick brown\"", + "\"the quack briwn\"", + "\"the quick\"", + "\"the quack\"", + "\"the\"", + "\"the\"", + ] + "###); +} + +#[test] +fn test_words_after_exactness() { + let index = create_index_simple_ordered_with_typos(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[19, 18, 16, 17, 9, 15, 8, 14, 6, 7, 13, 5, 4, 12, 3, 2, 1, 11]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the lazy\"", + "\"the quick brown fox jumps over\"", + "\"the quick brown fox jumps over the\"", + "\"the quack briwn fox jlmps over the lazy dog\"", + "\"the quick brown fox jumps\"", + "\"the quack briwn fox jlmps over the lazy\"", + "\"the quick brown fox\"", + "\"the quack briwn fox jlmps over\"", + "\"the quack briwn fox jlmps over the\"", + "\"the quick brown\"", + "\"the quack briwn fox jlmps\"", + "\"the quack briwn fox\"", + "\"the quick\"", + "\"the quack briwn\"", + "\"the quack\"", + "\"the\"", + "\"the\"", + ] + "###); +} + +#[test] +fn test_proximity_after_exactness() { + let index = create_index_with_varying_proximities(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 1, 0, 5, 4, 3, 8, 6, 7]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"the quick brown fox jumps over the lazy dog\"", + "\"the quick brown fox jumps over the very lazy dog\"", + "\"lazy jumps dog brown quick the over fox the\"", + "\"the quick brown fox over the lazy dog\"", + "\"the quick brown fox over the very lazy dog\"", + "\"dog brown quick the over fox the lazy\"", + "\"the quick brown fox over\"", + "\"brown quick the over fox\"", + "\"the very quick brown fox over\"", + ] + "###); + + let index = create_index_all_equal_except_proximity_between_ignored_terms(); + + index + .update_settings(|s| { + s.set_criteria(vec![Criterion::Exactness, Criterion::Words, Criterion::Proximity]); + }) + .unwrap(); + + let txn = index.read_txn().unwrap(); + + let mut s = Search::new(&txn, &index); + s.terms_matching_strategy(TermsMatchingStrategy::Last); + s.query("the quick brown fox jumps over the lazy dog"); + let SearchResult { documents_ids, .. } = s.execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2]"); + let texts = collect_field_values(&index, &txn, "text", &documents_ids); + + insta::assert_debug_snapshot!(texts, @r###" + [ + "\"lazy jumps dog brown quick the over fox the\"", + "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps\"", + "\"lazy jumps dog brown quick the over fox the. quack briwn jlmps overt\"", + ] + "###); +} diff --git a/milli/src/search/new/words.rs b/milli/src/search/new/words.rs index 39bbc823d..5c28f017b 100644 --- a/milli/src/search/new/words.rs +++ b/milli/src/search/new/words.rs @@ -33,7 +33,7 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { &mut self, ctx: &mut SearchContext<'ctx>, _logger: &mut dyn SearchLogger, - _parent_candidates: &RoaringBitmap, + _universe: &RoaringBitmap, parent_query_graph: &QueryGraph, ) -> Result<()> { self.exhausted = false; @@ -77,7 +77,6 @@ impl<'ctx> RankingRule<'ctx, QueryGraph> for Words { let nodes_to_remove = self.nodes_to_remove.pop().unwrap(); query_graph.remove_nodes_keep_edges(&nodes_to_remove.iter().collect::>()); } - Ok(Some(RankingRuleOutput { query: child_query_graph, candidates: this_bucket })) }