From 777b387dc4227fca3c2052662ea6d19b0f1a009d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Thu, 8 Dec 2022 11:53:21 +0100 Subject: [PATCH 1/3] Avoid a prefix-related worst-case scenario in the proximity criterion --- milli/src/search/criteria/mod.rs | 124 +++++++++++++++++----- milli/src/search/criteria/proximity.rs | 3 + milli/src/update/mod.rs | 5 +- milli/src/update/prefix_word_pairs/mod.rs | 26 +---- 4 files changed, 108 insertions(+), 50 deletions(-) diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 98b9d928e..0a5bfd664 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -17,6 +17,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use super::CriterionImplementationStrategy; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, Distinct, WordDerivationsCache}; +use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -653,14 +654,30 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix { - match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { + // There are three distinct cases which we need to distinguish regarding the prefix `right`: + // + // 1. `right` is not in any prefix cache because it is not the prefix of many words + // (and thus, it doesn't have many word derivations) + // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either + // because it is too long or because the given proximity is too high. + // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases + // + // The three cases are handled as follows: + // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` + // database for each of them. + // 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the + // proximity ranking rule for the prefixes of the right word. This is done as follows: + // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word + // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 + // + // 3. Query the prefix proximity databases. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_overall_proximity_docids( ctx, @@ -669,40 +686,91 @@ fn query_pair_proximity_docids( proximity, ) } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in the given proximity to the left word. + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => Ok(word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()), } } else { - Ok(ctx - .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? - .unwrap_or_default()) + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) } } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let current_docids = match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { - let r_words = - word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids( + // The logic here is almost identical to the one in the previous match branch. + // The difference is that we fetch the docids for each derivation of the left word. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { + let mut docids = RoaringBitmap::new(); + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; + for (left, _) in l_words { + docids |= all_word_pair_overall_proximity_docids( ctx, &[(left, 0)], r_words, proximity, - ) + )?; } - }?; - docids |= current_docids; + Ok(docids) + } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. + let mut candidates = RoaringBitmap::new(); + for (left, _) in l_words { + candidates |= ctx + .word_pair_proximity_docids(&left, right, proximity)? + .unwrap_or_default(); + } + Ok(candidates) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + docids |= word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default(); + } + Ok(docids) + } } - Ok(docids) } else { all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2072d0133..20e540f7f 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -590,3 +590,6 @@ fn resolve_plane_sweep_candidates( Ok(candidates) } + +#[cfg(test)] +mod tests {} diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 2dda24172..948811a6b 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -7,7 +7,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; +pub use self::prefix_word_pairs::{ + PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, + MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, +}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 49874993c..bed542bdb 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -14,6 +14,9 @@ mod word_prefix; pub use prefix_word::index_prefix_word_database; pub use word_prefix::index_word_prefix_database; +pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; +pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; + pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -32,31 +35,12 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { Self { wtxn, index, - max_proximity: 4, - max_prefix_length: 2, + max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, chunk_compression_type, chunk_compression_level, } } - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>( From f097aafa1c79ac52da56c717c4bc8a668772229d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 28 Nov 2022 16:10:17 +0100 Subject: [PATCH 2/3] Add unit test for prefix handling by the proximity criterion --- milli/src/search/criteria/proximity.rs | 99 +++++++++++++++++++++++++- 1 file changed, 98 insertions(+), 1 deletion(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 20e540f7f..160c4908b 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -592,4 +592,101 @@ fn resolve_plane_sweep_candidates( } #[cfg(test)] -mod tests {} +mod tests { + use std::io::Cursor; + + use big_s::S; + + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; + use crate::index::tests::TempIndex; + use crate::SearchResult; + + fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { + let mut documents = Vec::new(); + for prefix in prefixes { + for i in 0..500 { + documents.push( + serde_json::json!({ + "text": format!("{prefix}{i:x}"), + }) + .as_object() + .unwrap() + .clone(), + ) + } + } + documents + } + + #[test] + fn test_proximity_criterion_prefix_handling() { + let mut index = TempIndex::new(); + index.index_documents_config.autogenerate_docids = true; + + index + .update_settings(|settings| { + settings.set_primary_key(S("id")); + settings.set_criteria(vec![ + "words".to_owned(), + "typo".to_owned(), + "proximity".to_owned(), + ]); + }) + .unwrap(); + + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + + for doc in [ + // 0 + serde_json::json!({ "text": "zero is exactly the amount of configuration I want" }), + // 1 + serde_json::json!({ "text": "zero bad configuration" }), + // 2 + serde_json::json!({ "text": "zero configuration" }), + // 3 + serde_json::json!({ "text": "zero config" }), + // 4 + serde_json::json!({ "text": "zero conf" }), + // 5 + serde_json::json!({ "text": "zero bad conf" }), + ] { + documents.append_json_object(doc.as_object().unwrap()).unwrap(); + } + for doc in documents_with_enough_different_words_for_prefixes(&["conf"]) { + documents.append_json_object(&doc).unwrap(); + } + let documents = + DocumentsBatchReader::from_reader(Cursor::new(documents.into_inner().unwrap())) + .unwrap(); + + index.add_documents(documents).unwrap(); + + let rtxn = index.read_txn().unwrap(); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero c").execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero co").execute().unwrap(); + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero con").execute().unwrap(); + // Here searh results are degraded because `con` is in the prefix cache but it is too + // long to be stored in the prefix proximity databases, and we don't want to iterate over + // all of its word derivations + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero conf").execute().unwrap(); + // Here search results are degraded as well, but we can still rank correctly documents + // that contain `conf` exactly, and not as a prefix. + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]"); + + let SearchResult { matching_words: _, candidates: _, documents_ids } = + index.search(&rtxn).query("zero config").execute().unwrap(); + // `config` is not a common prefix, so the normal methods are used + insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]"); + } +} From 8d36570958070e0bbf8f185691f6f20d5a125db9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 2 Jan 2023 10:37:01 +0100 Subject: [PATCH 3/3] Add explicit criterion impl strategy to proximity search tests --- milli/src/search/criteria/proximity.rs | 42 +++++++++++++++++++------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 160c4908b..1d86f4da1 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -599,7 +599,7 @@ mod tests { use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; - use crate::SearchResult; + use crate::{CriterionImplementationStrategy, SearchResult}; fn documents_with_enough_different_words_for_prefixes(prefixes: &[&str]) -> Vec { let mut documents = Vec::new(); @@ -663,29 +663,49 @@ mod tests { let rtxn = index.read_txn().unwrap(); - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("zero c").execute().unwrap(); + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero c") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("zero co").execute().unwrap(); + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero co") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 4, 1, 5, 0]"); - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("zero con").execute().unwrap(); + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .query("zero con") + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .execute() + .unwrap(); // Here searh results are degraded because `con` is in the prefix cache but it is too // long to be stored in the prefix proximity databases, and we don't want to iterate over // all of its word derivations insta::assert_snapshot!(format!("{documents_ids:?}"), @"[0, 1, 2, 3, 4, 5]"); - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("zero conf").execute().unwrap(); + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .query("zero conf") + .execute() + .unwrap(); // Here search results are degraded as well, but we can still rank correctly documents // that contain `conf` exactly, and not as a prefix. insta::assert_snapshot!(format!("{documents_ids:?}"), @"[4, 5, 0, 1, 2, 3]"); - let SearchResult { matching_words: _, candidates: _, documents_ids } = - index.search(&rtxn).query("zero config").execute().unwrap(); + let SearchResult { matching_words: _, candidates: _, documents_ids } = index + .search(&rtxn) + .criterion_implementation_strategy(CriterionImplementationStrategy::OnlySetBased) + .query("zero config") + .execute() + .unwrap(); // `config` is not a common prefix, so the normal methods are used insta::assert_snapshot!(format!("{documents_ids:?}"), @"[2, 3, 1, 0, 4, 5]"); }