diff --git a/milli/src/search/criteria/mod.rs b/milli/src/search/criteria/mod.rs index 98b9d928e..0a5bfd664 100644 --- a/milli/src/search/criteria/mod.rs +++ b/milli/src/search/criteria/mod.rs @@ -17,6 +17,7 @@ use super::query_tree::{Operation, PrimitiveQueryPart, Query, QueryKind}; use super::CriterionImplementationStrategy; use crate::search::criteria::geo::Geo; use crate::search::{word_derivations, Distinct, WordDerivationsCache}; +use crate::update::{MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB}; use crate::{AscDesc as AscDescName, DocumentId, FieldId, Index, Member, Result}; mod asc_desc; @@ -653,14 +654,30 @@ fn query_pair_proximity_docids( match (&left.kind, &right.kind) { (QueryKind::Exact { word: left, .. }, QueryKind::Exact { word: right, .. }) => { if prefix { - match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { + // There are three distinct cases which we need to distinguish regarding the prefix `right`: + // + // 1. `right` is not in any prefix cache because it is not the prefix of many words + // (and thus, it doesn't have many word derivations) + // 2. `right` is in the prefix cache but cannot be found in the "word prefix pair proximity" databases either + // because it is too long or because the given proximity is too high. + // 3. `right` is in the prefix cache and can be found in the "word prefix pair proximity" databases + // + // The three cases are handled as follows: + // 1. We manually retrieve all the word derivations of `right` and check the `word_pair_proximity` + // database for each of them. + // 2. It would be too expensive to apply the same strategy as (1), therefore, we "disable" the + // proximity ranking rule for the prefixes of the right word. This is done as follows: + // 1. Only find the documents where left is in proximity to the exact (ie non-prefix) right word + // 2. Otherwise, assume that their proximity in all the documents in which they coexist is >= 8 + // + // 3. Query the prefix proximity databases. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; all_word_pair_overall_proximity_docids( ctx, @@ -669,40 +686,91 @@ fn query_pair_proximity_docids( proximity, ) } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in the given proximity to the left word. + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => Ok(word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()), } } else { - Ok(ctx - .word_pair_proximity_docids(left.as_str(), right.as_str(), proximity)? - .unwrap_or_default()) + Ok(word_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default()) } } (QueryKind::Tolerant { typo, word: left }, QueryKind::Exact { word: right, .. }) => { let l_words = word_derivations(left, false, *typo, ctx.words_fst(), wdcache)?.to_owned(); if prefix { - let mut docids = RoaringBitmap::new(); - for (left, _) in l_words { - let current_docids = match word_prefix_pair_overall_proximity_docids( - ctx, - left.as_str(), - right.as_str(), - proximity, - )? { - Some(docids) => Ok(docids), - None => { - let r_words = - word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; - all_word_pair_overall_proximity_docids( + // The logic here is almost identical to the one in the previous match branch. + // The difference is that we fetch the docids for each derivation of the left word. + match ( + ctx.in_prefix_cache(right), + right.len() <= MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB + && proximity <= MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + ) { + // Case 1: not in prefix cache + (false, _) => { + let mut docids = RoaringBitmap::new(); + let r_words = word_derivations(right, true, 0, ctx.words_fst(), wdcache)?; + for (left, _) in l_words { + docids |= all_word_pair_overall_proximity_docids( ctx, &[(left, 0)], r_words, proximity, - ) + )?; } - }?; - docids |= current_docids; + Ok(docids) + } + // Case 2: in prefix cache but either the prefix length or the proximity makes it impossible to + // query the prefix proximity databases. + (true, false) => { + // To "save" the relevancy a little bit, we still find the documents where the + // exact (i.e. non-prefix) right word is in proximity to any derivation of the left word. + let mut candidates = RoaringBitmap::new(); + for (left, _) in l_words { + candidates |= ctx + .word_pair_proximity_docids(&left, right, proximity)? + .unwrap_or_default(); + } + Ok(candidates) + } + // Case 3: in prefix cache, short enough, and proximity is low enough + (true, true) => { + let mut docids = RoaringBitmap::new(); + for (left, _) in l_words { + docids |= word_prefix_pair_overall_proximity_docids( + ctx, + left.as_str(), + right.as_str(), + proximity, + )? + .unwrap_or_default(); + } + Ok(docids) + } } - Ok(docids) } else { all_word_pair_overall_proximity_docids(ctx, &l_words, &[(right, 0)], proximity) } diff --git a/milli/src/search/criteria/proximity.rs b/milli/src/search/criteria/proximity.rs index 2072d0133..20e540f7f 100644 --- a/milli/src/search/criteria/proximity.rs +++ b/milli/src/search/criteria/proximity.rs @@ -590,3 +590,6 @@ fn resolve_plane_sweep_candidates( Ok(candidates) } + +#[cfg(test)] +mod tests {} diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 2dda24172..948811a6b 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -7,7 +7,10 @@ pub use self::index_documents::{ DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; -pub use self::prefix_word_pairs::PrefixWordPairsProximityDocids; +pub use self::prefix_word_pairs::{ + PrefixWordPairsProximityDocids, MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, + MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, +}; pub use self::settings::{Setting, Settings}; pub use self::update_step::UpdateIndexingStep; pub use self::word_prefix_docids::WordPrefixDocids; diff --git a/milli/src/update/prefix_word_pairs/mod.rs b/milli/src/update/prefix_word_pairs/mod.rs index 49874993c..bed542bdb 100644 --- a/milli/src/update/prefix_word_pairs/mod.rs +++ b/milli/src/update/prefix_word_pairs/mod.rs @@ -14,6 +14,9 @@ mod word_prefix; pub use prefix_word::index_prefix_word_database; pub use word_prefix::index_word_prefix_database; +pub const MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB: u8 = 4; +pub const MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB: usize = 2; + pub struct PrefixWordPairsProximityDocids<'t, 'u, 'i> { wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index, @@ -32,31 +35,12 @@ impl<'t, 'u, 'i> PrefixWordPairsProximityDocids<'t, 'u, 'i> { Self { wtxn, index, - max_proximity: 4, - max_prefix_length: 2, + max_proximity: MAX_PROXIMITY_FOR_PREFIX_PROXIMITY_DB, + max_prefix_length: MAX_LENGTH_FOR_PREFIX_PROXIMITY_DB, chunk_compression_type, chunk_compression_level, } } - /// Set the maximum proximity required to make a prefix be part of the words prefixes - /// database. If two words are too far from the threshold the associated documents will - /// not be part of the prefix database. - /// - /// Default value is 4. This value must be lower or equal than 7 and will be clamped - /// to this bound otherwise. - pub fn max_proximity(&mut self, value: u8) -> &mut Self { - self.max_proximity = value.max(7); - self - } - /// Set the maximum length the prefix of a word pair is allowed to have to be part of the words - /// prefixes database. If the prefix length is higher than the threshold, the associated documents - /// will not be part of the prefix database. - /// - /// Default value is 2. - pub fn max_prefix_length(&mut self, value: usize) -> &mut Self { - self.max_prefix_length = value; - self - } #[logging_timer::time("WordPrefixPairProximityDocids::{}")] pub fn execute<'a>(