From 50a7393c554333de51494b59834845b0fb51dc3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 21 Jun 2024 11:43:04 +0200 Subject: [PATCH 1/9] Modify the compute_query_term_subset_docids function to accept the universe --- milli/src/search/new/query_graph.rs | 2 +- .../search/new/ranking_rule_graph/exactness/mod.rs | 2 +- .../ranking_rule_graph/proximity/compute_docids.rs | 4 +--- milli/src/search/new/ranking_rule_graph/typo/mod.rs | 3 +-- milli/src/search/new/ranking_rule_graph/words/mod.rs | 3 +-- milli/src/search/new/resolve_query_graph.rs | 11 ++++++----- 6 files changed, 11 insertions(+), 14 deletions(-) diff --git a/milli/src/search/new/query_graph.rs b/milli/src/search/new/query_graph.rs index 40e324a7e..9ab5d9dad 100644 --- a/milli/src/search/new/query_graph.rs +++ b/milli/src/search/new/query_graph.rs @@ -302,7 +302,7 @@ impl QueryGraph { for (_, node) in self.nodes.iter() { match &node.data { QueryNodeData::Term(t) => { - let docids = compute_query_term_subset_docids(ctx, &t.term_subset)?; + let docids = compute_query_term_subset_docids(ctx, None, &t.term_subset)?; for id in t.term_ids.clone() { term_docids .entry(id) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index d3fa040ee..274688877 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -59,7 +59,7 @@ impl RankingRuleGraphTrait for ExactnessGraph { } ExactnessCondition::Any(dest_node) => { let docids = - universe & compute_query_term_subset_docids(ctx, &dest_node.term_subset)?; + compute_query_term_subset_docids(ctx, Some(universe), &dest_node.term_subset)?; (docids, dest_node.clone()) } }; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index b348c682d..e73e5e6aa 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -22,10 +22,8 @@ pub fn compute_docids( (left_term, right_term, *cost) } ProximityCondition::Term { term } => { - let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; - docids &= universe; return Ok(ComputedCondition { - docids, + docids: compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?, universe_len: universe.len(), start_term_subset: None, end_term_subset: term.clone(), diff --git a/milli/src/search/new/ranking_rule_graph/typo/mod.rs b/milli/src/search/new/ranking_rule_graph/typo/mod.rs index c1256f6a1..b19163ba2 100644 --- a/milli/src/search/new/ranking_rule_graph/typo/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/typo/mod.rs @@ -27,8 +27,7 @@ impl RankingRuleGraphTrait for TypoGraph { ) -> Result { let TypoCondition { term, .. } = condition; // maybe compute_query_term_subset_docids should accept a universe as argument - let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; - docids &= universe; + let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?; Ok(ComputedCondition { docids, diff --git a/milli/src/search/new/ranking_rule_graph/words/mod.rs b/milli/src/search/new/ranking_rule_graph/words/mod.rs index bb6baf669..19b54e76f 100644 --- a/milli/src/search/new/ranking_rule_graph/words/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/words/mod.rs @@ -26,8 +26,7 @@ impl RankingRuleGraphTrait for WordsGraph { ) -> Result { let WordsCondition { term, .. } = condition; // maybe compute_query_term_subset_docids should accept a universe as argument - let mut docids = compute_query_term_subset_docids(ctx, &term.term_subset)?; - docids &= universe; + let docids = compute_query_term_subset_docids(ctx, Some(universe), &term.term_subset)?; Ok(ComputedCondition { docids, diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 8d89e8275..3f24319f1 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -31,6 +31,7 @@ impl<'ctx> SearchContext<'ctx> { } pub fn compute_query_term_subset_docids( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, term: &QueryTermSubset, ) -> Result { let mut docids = RoaringBitmap::new(); @@ -49,7 +50,10 @@ pub fn compute_query_term_subset_docids( } } - Ok(docids) + match universe { + Some(universe) => Ok(docids & universe), + None => Ok(docids), + } } pub fn compute_query_term_subset_docids_within_field_id( @@ -147,10 +151,7 @@ pub fn compute_query_graph_docids( term_subset, positions: _, term_ids: _, - }) => { - let node_docids = compute_query_term_subset_docids(ctx, term_subset)?; - predecessors_docids & node_docids - } + }) => compute_query_term_subset_docids(ctx, Some(&predecessors_docids), term_subset)?, QueryNodeData::Deleted => { panic!() } From 0ca1a4e8053f4a7c56894846c6cc19fcba153c8d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 21 Jun 2024 14:26:05 +0200 Subject: [PATCH 2/9] Always do the intersections with the universe --- milli/src/search/new/db_cache.rs | 183 ++++++++++++------ milli/src/search/new/exact_attribute.rs | 11 +- milli/src/search/new/mod.rs | 10 +- .../new/query_term/compute_derivations.rs | 2 +- .../new/ranking_rule_graph/exactness/mod.rs | 13 +- .../search/new/ranking_rule_graph/fid/mod.rs | 8 +- .../new/ranking_rule_graph/position/mod.rs | 1 + .../proximity/compute_docids.rs | 37 ++-- milli/src/search/new/resolve_query_graph.rs | 43 ++-- 9 files changed, 201 insertions(+), 107 deletions(-) diff --git a/milli/src/search/new/db_cache.rs b/milli/src/search/new/db_cache.rs index 95f8e7557..d33058af1 100644 --- a/milli/src/search/new/db_cache.rs +++ b/milli/src/search/new/db_cache.rs @@ -46,36 +46,70 @@ pub struct DatabaseCache<'ctx> { pub word_prefix_fids: FxHashMap, Vec>, } impl<'ctx> DatabaseCache<'ctx> { - fn get_value<'v, K1, KC, DC>( + fn get_value<'v, K1, KC>( txn: &'ctx RoTxn<'_>, cache_key: K1, db_key: &'v KC::EItem, cache: &mut FxHashMap>>, + universe: Option<&RoaringBitmap>, db: Database, - ) -> Result> + ) -> Result> where K1: Copy + Eq + Hash, KC: BytesEncode<'v>, - DC: BytesDecodeOwned, { if let Entry::Vacant(entry) = cache.entry(cache_key) { let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); entry.insert(bitmap_ptr); } - match cache.get(&cache_key).unwrap() { - Some(Cow::Borrowed(bytes)) => DC::bytes_decode_owned(bytes) + let bitmap_bytes = match cache.get(&cache_key).unwrap() { + Some(Cow::Borrowed(bytes)) => bytes, + Some(Cow::Owned(bytes)) => bytes.as_slice(), + None => return Ok(None), + }; + + match (bitmap_bytes, universe) { + (bytes, Some(universe)) => { + CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe) + .map(Some) + .map_err(Into::into) + } + (bytes, None) => CboRoaringBitmapCodec::bytes_decode_owned(bytes) .map(Some) .map_err(heed::Error::Decoding) .map_err(Into::into), - Some(Cow::Owned(bytes)) => DC::bytes_decode_owned(bytes) - .map(Some) - .map_err(heed::Error::Decoding) - .map_err(Into::into), - None => Ok(None), } } + fn get_value_length<'v, K1, KC>( + txn: &'ctx RoTxn<'_>, + cache_key: K1, + db_key: &'v KC::EItem, + cache: &mut FxHashMap>>, + db: Database, + ) -> Result> + where + K1: Copy + Eq + Hash, + KC: BytesEncode<'v>, + { + if let Entry::Vacant(entry) = cache.entry(cache_key) { + let bitmap_ptr = db.get(txn, db_key)?.map(Cow::Borrowed); + entry.insert(bitmap_ptr); + } + + let bitmap_bytes = match cache.get(&cache_key).unwrap() { + Some(Cow::Borrowed(bytes)) => bytes, + Some(Cow::Owned(bytes)) => bytes.as_slice(), + None => return Ok(None), + }; + + CboRoaringBitmapLenCodec::bytes_decode_owned(bitmap_bytes) + .map(Some) + .map_err(heed::Error::Decoding) + .map_err(Into::into) + } + fn get_value_from_keys<'v, K1, KC, DC>( txn: &'ctx RoTxn<'_>, cache_key: K1, @@ -137,11 +171,15 @@ impl<'ctx> SearchContext<'ctx> { } } - pub fn word_docids(&mut self, word: Word) -> Result> { + pub fn word_docids( + &mut self, + universe: Option<&RoaringBitmap>, + word: Word, + ) -> Result> { match word { Word::Original(word) => { - let exact = self.get_db_exact_word_docids(word)?; - let tolerant = self.get_db_word_docids(word)?; + let exact = self.get_db_exact_word_docids(universe, word)?; + let tolerant = self.get_db_word_docids(universe, word)?; Ok(match (exact, tolerant) { (None, None) => None, (None, Some(tolerant)) => Some(tolerant), @@ -153,12 +191,16 @@ impl<'ctx> SearchContext<'ctx> { } }) } - Word::Derived(word) => self.get_db_word_docids(word), + Word::Derived(word) => self.get_db_word_docids(universe, word), } } /// Retrieve or insert the given value in the `word_docids` database. - fn get_db_word_docids(&mut self, word: Interned) -> Result> { + fn get_db_word_docids( + &mut self, + universe: Option<&RoaringBitmap>, + word: Interned, + ) -> Result> { match &self.restricted_fids { Some(restricted_fids) => { let interned = self.word_interner.get(word).as_str(); @@ -174,11 +216,12 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _>( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.word_docids, + universe, self.index.word_docids.remap_data_type::(), ), } @@ -186,6 +229,7 @@ impl<'ctx> SearchContext<'ctx> { fn get_db_exact_word_docids( &mut self, + universe: Option<&RoaringBitmap>, word: Interned, ) -> Result> { match &self.restricted_fids { @@ -203,21 +247,26 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _>( self.txn, word, self.word_interner.get(word).as_str(), &mut self.db_cache.exact_word_docids, + universe, self.index.exact_word_docids.remap_data_type::(), ), } } - pub fn word_prefix_docids(&mut self, prefix: Word) -> Result> { + pub fn word_prefix_docids( + &mut self, + universe: Option<&RoaringBitmap>, + prefix: Word, + ) -> Result> { match prefix { Word::Original(prefix) => { - let exact = self.get_db_exact_word_prefix_docids(prefix)?; - let tolerant = self.get_db_word_prefix_docids(prefix)?; + let exact = self.get_db_exact_word_prefix_docids(universe, prefix)?; + let tolerant = self.get_db_word_prefix_docids(universe, prefix)?; Ok(match (exact, tolerant) { (None, None) => None, (None, Some(tolerant)) => Some(tolerant), @@ -229,13 +278,14 @@ impl<'ctx> SearchContext<'ctx> { } }) } - Word::Derived(prefix) => self.get_db_word_prefix_docids(prefix), + Word::Derived(prefix) => self.get_db_word_prefix_docids(universe, prefix), } } /// Retrieve or insert the given value in the `word_prefix_docids` database. fn get_db_word_prefix_docids( &mut self, + universe: Option<&RoaringBitmap>, prefix: Interned, ) -> Result> { match &self.restricted_fids { @@ -253,11 +303,12 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _>( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.word_prefix_docids, + universe, self.index.word_prefix_docids.remap_data_type::(), ), } @@ -265,6 +316,7 @@ impl<'ctx> SearchContext<'ctx> { fn get_db_exact_word_prefix_docids( &mut self, + universe: Option<&RoaringBitmap>, prefix: Interned, ) -> Result> { match &self.restricted_fids { @@ -282,11 +334,12 @@ impl<'ctx> SearchContext<'ctx> { merge_cbo_roaring_bitmaps, ) } - None => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + None => DatabaseCache::get_value::<_, _>( self.txn, prefix, self.word_interner.get(prefix).as_str(), &mut self.db_cache.exact_word_prefix_docids, + universe, self.index.exact_word_prefix_docids.remap_data_type::(), ), } @@ -294,6 +347,7 @@ impl<'ctx> SearchContext<'ctx> { pub fn get_db_word_pair_proximity_docids( &mut self, + universe: Option<&RoaringBitmap>, word1: Interned, word2: Interned, proximity: u8, @@ -320,8 +374,8 @@ impl<'ctx> SearchContext<'ctx> { for fid in fids { // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. - let word1_docids = self.get_db_word_fid_docids(word1, fid)?; - let word2_docids = self.get_db_word_fid_docids(word2, fid)?; + let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?; + let word2_docids = self.get_db_word_fid_docids(universe, word2, fid)?; if let (Some(word1_docids), Some(word2_docids)) = (word1_docids, word2_docids) { @@ -341,7 +395,33 @@ impl<'ctx> SearchContext<'ctx> { Ok(docids) } - ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + ProximityPrecision::ByWord => DatabaseCache::get_value::<_, _>( + self.txn, + (proximity, word1, word2), + &( + proximity, + self.word_interner.get(word1).as_str(), + self.word_interner.get(word2).as_str(), + ), + &mut self.db_cache.word_pair_proximity_docids, + universe, + self.index.word_pair_proximity_docids.remap_data_type::(), + ), + } + } + + pub fn get_db_word_pair_proximity_docids_len( + &mut self, + universe: Option<&RoaringBitmap>, + word1: Interned, + word2: Interned, + proximity: u8, + ) -> Result> { + match self.index.proximity_precision(self.txn)?.unwrap_or_default() { + ProximityPrecision::ByAttribute => Ok(self + .get_db_word_pair_proximity_docids(universe, word1, word2, proximity)? + .map(|d| d.len())), + ProximityPrecision::ByWord => DatabaseCache::get_value_length::<_, _>( self.txn, (proximity, word1, word2), &( @@ -355,34 +435,9 @@ impl<'ctx> SearchContext<'ctx> { } } - pub fn get_db_word_pair_proximity_docids_len( - &mut self, - word1: Interned, - word2: Interned, - proximity: u8, - ) -> Result> { - match self.index.proximity_precision(self.txn)?.unwrap_or_default() { - ProximityPrecision::ByAttribute => Ok(self - .get_db_word_pair_proximity_docids(word1, word2, proximity)? - .map(|d| d.len())), - ProximityPrecision::ByWord => { - DatabaseCache::get_value::<_, _, CboRoaringBitmapLenCodec>( - self.txn, - (proximity, word1, word2), - &( - proximity, - self.word_interner.get(word1).as_str(), - self.word_interner.get(word2).as_str(), - ), - &mut self.db_cache.word_pair_proximity_docids, - self.index.word_pair_proximity_docids.remap_data_type::(), - ) - } - } - } - pub fn get_db_word_prefix_pair_proximity_docids( &mut self, + universe: Option<&RoaringBitmap>, word1: Interned, prefix2: Interned, mut proximity: u8, @@ -409,8 +464,9 @@ impl<'ctx> SearchContext<'ctx> { // for each field, intersect left word bitmap and right word bitmap, // then merge the result in a global bitmap before storing it in the cache. for fid in fids { - let word1_docids = self.get_db_word_fid_docids(word1, fid)?; - let prefix2_docids = self.get_db_word_prefix_fid_docids(prefix2, fid)?; + let word1_docids = self.get_db_word_fid_docids(universe, word1, fid)?; + let prefix2_docids = + self.get_db_word_prefix_fid_docids(universe, prefix2, fid)?; if let (Some(word1_docids), Some(prefix2_docids)) = (word1_docids, prefix2_docids) { @@ -452,16 +508,18 @@ impl<'ctx> SearchContext<'ctx> { pub fn get_db_prefix_word_pair_proximity_docids( &mut self, + universe: Option<&RoaringBitmap>, left_prefix: Interned, right: Interned, proximity: u8, ) -> Result> { // only accept exact matches on reverted positions - self.get_db_word_pair_proximity_docids(left_prefix, right, proximity) + self.get_db_word_pair_proximity_docids(universe, left_prefix, right, proximity) } pub fn get_db_word_fid_docids( &mut self, + universe: Option<&RoaringBitmap>, word: Interned, fid: u16, ) -> Result> { @@ -470,17 +528,19 @@ impl<'ctx> SearchContext<'ctx> { return Ok(None); } - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value::<_, _>( self.txn, (word, fid), &(self.word_interner.get(word).as_str(), fid), &mut self.db_cache.word_fid_docids, + universe, self.index.word_fid_docids.remap_data_type::(), ) } pub fn get_db_word_prefix_fid_docids( &mut self, + universe: Option<&RoaringBitmap>, word_prefix: Interned, fid: u16, ) -> Result> { @@ -489,11 +549,12 @@ impl<'ctx> SearchContext<'ctx> { return Ok(None); } - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value::<_, _>( self.txn, (word_prefix, fid), &(self.word_interner.get(word_prefix).as_str(), fid), &mut self.db_cache.word_prefix_fid_docids, + universe, self.index.word_prefix_fid_docids.remap_data_type::(), ) } @@ -554,28 +615,32 @@ impl<'ctx> SearchContext<'ctx> { pub fn get_db_word_position_docids( &mut self, + universe: Option<&RoaringBitmap>, word: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value::<_, _>( self.txn, (word, position), &(self.word_interner.get(word).as_str(), position), &mut self.db_cache.word_position_docids, + universe, self.index.word_position_docids.remap_data_type::(), ) } pub fn get_db_word_prefix_position_docids( &mut self, + universe: Option<&RoaringBitmap>, word_prefix: Interned, position: u16, ) -> Result> { - DatabaseCache::get_value::<_, _, CboRoaringBitmapCodec>( + DatabaseCache::get_value::<_, _>( self.txn, (word_prefix, position), &(self.word_interner.get(word_prefix).as_str(), position), &mut self.db_cache.word_prefix_position_docids, + universe, self.index.word_prefix_position_docids.remap_data_type::(), ) } diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index d270c4847..52efa63a4 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -171,9 +171,10 @@ impl State { // Note: Since the position is stored bucketed in word_position_docids, for queries with a lot of // longer phrases we'll be losing on precision here. let bucketed_position = crate::bucketed_position(position + offset); - let word_position_docids = - ctx.get_db_word_position_docids(*word, bucketed_position)?.unwrap_or_default() - & universe; + let word_position_docids = ctx + .get_db_word_position_docids(Some(universe), *word, bucketed_position)? + .unwrap_or_default() + & universe; candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -199,7 +200,9 @@ impl State { // ignore stop words words in phrases .flatten() .map(|word| -> Result<_> { - Ok(ctx.get_db_word_fid_docids(*word, fid)?.unwrap_or_default()) + Ok(ctx + .get_db_word_fid_docids(Some(universe), *word, fid)? + .unwrap_or_default()) }), )?; intersection &= &candidates; diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 208e1b428..73aaa330f 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -232,11 +232,12 @@ fn resolve_universe( #[tracing::instrument(level = "trace", skip_all, target = "search::query")] fn resolve_negative_words( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, negative_words: &[Word], ) -> Result { let mut negative_bitmap = RoaringBitmap::new(); for &word in negative_words { - if let Some(bitmap) = ctx.word_docids(word)? { + if let Some(bitmap) = ctx.word_docids(universe, word)? { negative_bitmap |= bitmap; } } @@ -246,13 +247,14 @@ fn resolve_negative_words( #[tracing::instrument(level = "trace", skip_all, target = "search::query")] fn resolve_negative_phrases( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, negative_phrases: &[LocatedQueryTerm], ) -> Result { let mut negative_bitmap = RoaringBitmap::new(); for term in negative_phrases { let query_term = ctx.term_interner.get(term.value); if let Some(phrase) = query_term.original_phrase() { - negative_bitmap |= ctx.get_phrase_docids(phrase)?; + negative_bitmap |= ctx.get_phrase_docids(universe, phrase)?; } } Ok(negative_bitmap) @@ -686,8 +688,8 @@ pub fn execute_search( located_query_terms_from_tokens(ctx, tokens, words_limit)?; used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); - let ignored_documents = resolve_negative_words(ctx, &negative_words)?; - let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; + let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?; + let ignored_phrases = resolve_negative_phrases(ctx, Some(&universe), &negative_phrases)?; universe -= ignored_documents; universe -= ignored_phrases; diff --git a/milli/src/search/new/query_term/compute_derivations.rs b/milli/src/search/new/query_term/compute_derivations.rs index c1783431c..e2a136328 100644 --- a/milli/src/search/new/query_term/compute_derivations.rs +++ b/milli/src/search/new/query_term/compute_derivations.rs @@ -417,7 +417,7 @@ fn split_best_frequency( let left = ctx.word_interner.insert(left.to_owned()); let right = ctx.word_interner.insert(right.to_owned()); - if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(left, right, 1)? { + if let Some(frequency) = ctx.get_db_word_pair_proximity_docids_len(None, left, right, 1)? { if best.map_or(true, |(old, _, _)| frequency > old) { best = Some((frequency, left, right)); } diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 274688877..ae3a582bb 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -26,18 +26,15 @@ fn compute_docids( } else { return Ok(Default::default()); }; - let mut candidates = match exact_term { - ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)?.clone(), + + let candidates = match exact_term { + // TODO I move the intersection here + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(Some(universe), phrase)? & universe, ExactTerm::Word(word) => { - if let Some(word_candidates) = ctx.word_docids(Word::Original(word))? { - word_candidates - } else { - return Ok(Default::default()); - } + ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default() } }; - candidates &= universe; Ok(candidates) } diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index 2019eb380..ad4a83dc3 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -30,8 +30,12 @@ impl RankingRuleGraphTrait for FidGraph { let docids = if let Some(fid) = condition.fid { // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let docids = - compute_query_term_subset_docids_within_field_id(ctx, &term.term_subset, fid)?; + let docids = compute_query_term_subset_docids_within_field_id( + ctx, + Some(universe), + &term.term_subset, + fid, + )?; docids & universe } else { RoaringBitmap::new() diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index 567aad550..fd61c1dcc 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -33,6 +33,7 @@ impl RankingRuleGraphTrait for PositionGraph { docids |= universe & compute_query_term_subset_docids_within_position( ctx, + Some(universe), &term.term_subset, *position, )?; diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index e73e5e6aa..29459cd99 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -74,10 +74,10 @@ pub fn compute_docids( if right_derivs.len() > 1 { let universe = &universe; if let Some(left_phrase) = left_phrase { - if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { + if universe.is_disjoint(ctx.get_phrase_docids(Some(universe), left_phrase)?) { continue; } - } else if let Some(left_word_docids) = ctx.word_docids(left_word)? { + } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? { if universe.is_disjoint(&left_word_docids) { continue; } @@ -123,7 +123,10 @@ fn compute_prefix_edges( let mut universe = universe.clone(); if let Some(phrase) = left_phrase { - let phrase_docids = ctx.get_phrase_docids(phrase)?; + // TODO we can clearly give the universe to this method + // Unfortunately, it is deserializing/computing stuff and + // keeping the result as a materialized bitmap. + let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?; if !phrase_docids.is_empty() { used_left_phrases.insert(phrase); } @@ -133,9 +136,13 @@ fn compute_prefix_edges( } } - if let Some(new_docids) = - ctx.get_db_word_prefix_pair_proximity_docids(left_word, right_prefix, forward_proximity)? - { + // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff. + if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids( + Some(&universe), + left_word, + right_prefix, + forward_proximity, + )? { let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); @@ -147,6 +154,7 @@ fn compute_prefix_edges( // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( + Some(&universe), right_prefix, left_word, backward_proximity, @@ -177,26 +185,29 @@ fn compute_non_prefix_edges( let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { - let phrase_docids = ctx.get_phrase_docids(phrase)?; + // TODO do the intersection in the method, again! + let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?; universe &= phrase_docids; if universe.is_empty() { return Ok(()); } } + // TODO check that it is not an issue to alterate the universe if let Some(new_docids) = - ctx.get_db_word_pair_proximity_docids(word1, word2, forward_proximity)? + ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)? { - let new_docids = &universe & new_docids; if !new_docids.is_empty() { *docids |= new_docids; } } if backward_proximity >= 1 && left_phrase.is_none() && right_phrase.is_none() { - if let Some(new_docids) = - ctx.get_db_word_pair_proximity_docids(word2, word1, backward_proximity)? - { - let new_docids = &universe & new_docids; + if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids( + Some(&universe), + word2, + word1, + backward_proximity, + )? { if !new_docids.is_empty() { *docids |= new_docids; } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 3f24319f1..6cab2f364 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -19,11 +19,16 @@ pub struct PhraseDocIdsCache { } impl<'ctx> SearchContext<'ctx> { /// Get the document ids associated with the given phrase - pub fn get_phrase_docids(&mut self, phrase: Interned) -> Result<&RoaringBitmap> { + pub fn get_phrase_docids( + &mut self, + universe: Option<&RoaringBitmap>, + phrase: Interned, + ) -> Result<&RoaringBitmap> { if self.phrase_docids.cache.contains_key(&phrase) { return Ok(&self.phrase_docids.cache[&phrase]); }; - let docids = compute_phrase_docids(self, phrase)?; + let docids = compute_phrase_docids(self, universe, phrase)?; + // TODO can we improve that? Because there is an issue, we keep that in cache... let _ = self.phrase_docids.cache.insert(phrase, docids); let docids = &self.phrase_docids.cache[&phrase]; Ok(docids) @@ -35,17 +40,18 @@ pub fn compute_query_term_subset_docids( term: &QueryTermSubset, ) -> Result { let mut docids = RoaringBitmap::new(); + // TODO use the MultiOps trait to do large intersections for word in term.all_single_words_except_prefix_db(ctx)? { - if let Some(word_docids) = ctx.word_docids(word)? { + if let Some(word_docids) = ctx.word_docids(universe, word)? { docids |= word_docids; } } for phrase in term.all_phrases(ctx)? { - docids |= ctx.get_phrase_docids(phrase)?; + docids |= ctx.get_phrase_docids(universe, phrase)?; } if let Some(prefix) = term.use_prefix_db(ctx) { - if let Some(prefix_docids) = ctx.word_prefix_docids(prefix)? { + if let Some(prefix_docids) = ctx.word_prefix_docids(universe, prefix)? { docids |= prefix_docids; } } @@ -58,12 +64,13 @@ pub fn compute_query_term_subset_docids( pub fn compute_query_term_subset_docids_within_field_id( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, term: &QueryTermSubset, fid: u16, ) -> Result { let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { - if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(word.interned(), fid)? { + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, word.interned(), fid)? { docids |= word_fid_docids; } } @@ -72,15 +79,15 @@ pub fn compute_query_term_subset_docids_within_field_id( // There may be false positives when resolving a phrase, so we're not // guaranteed that all of its words are within a single fid. if let Some(word) = phrase.words(ctx).iter().flatten().next() { - if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(*word, fid)? { - docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; + if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? { + docids |= ctx.get_phrase_docids(Some(&word_fid_docids), phrase)?; } } } if let Some(word_prefix) = term.use_prefix_db(ctx) { if let Some(word_fid_docids) = - ctx.get_db_word_prefix_fid_docids(word_prefix.interned(), fid)? + ctx.get_db_word_prefix_fid_docids(universe, word_prefix.interned(), fid)? { docids |= word_fid_docids; } @@ -91,13 +98,14 @@ pub fn compute_query_term_subset_docids_within_field_id( pub fn compute_query_term_subset_docids_within_position( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, term: &QueryTermSubset, position: u16, ) -> Result { let mut docids = RoaringBitmap::new(); for word in term.all_single_words_except_prefix_db(ctx)? { if let Some(word_position_docids) = - ctx.get_db_word_position_docids(word.interned(), position)? + ctx.get_db_word_position_docids(universe, word.interned(), position)? { docids |= word_position_docids; } @@ -107,15 +115,17 @@ pub fn compute_query_term_subset_docids_within_position( // It's difficult to know the expected position of the words in the phrase, // so instead we just check the first one. if let Some(word) = phrase.words(ctx).iter().flatten().next() { - if let Some(word_position_docids) = ctx.get_db_word_position_docids(*word, position)? { - docids |= ctx.get_phrase_docids(phrase)? & word_position_docids + if let Some(word_position_docids) = + ctx.get_db_word_position_docids(universe, *word, position)? + { + docids |= ctx.get_phrase_docids(Some(&word_position_docids), phrase)?; } } } if let Some(word_prefix) = term.use_prefix_db(ctx) { if let Some(word_position_docids) = - ctx.get_db_word_prefix_position_docids(word_prefix.interned(), position)? + ctx.get_db_word_prefix_position_docids(universe, word_prefix.interned(), position)? { docids |= word_position_docids; } @@ -180,6 +190,7 @@ pub fn compute_query_graph_docids( pub fn compute_phrase_docids( ctx: &mut SearchContext<'_>, + universe: Option<&RoaringBitmap>, phrase: Interned, ) -> Result { let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); @@ -189,7 +200,7 @@ pub fn compute_phrase_docids( } let mut candidates = RoaringBitmap::new(); for word in words.iter().flatten().copied() { - if let Some(word_docids) = ctx.word_docids(Word::Original(word))? { + if let Some(word_docids) = ctx.word_docids(universe, Word::Original(word))? { candidates |= word_docids; } else { return Ok(RoaringBitmap::new()); @@ -213,7 +224,7 @@ pub fn compute_phrase_docids( .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match ctx.get_db_word_pair_proximity_docids(s1, s2, 1)? { + match ctx.get_db_word_pair_proximity_docids(universe, s1, s2, 1)? { Some(m) => bitmaps.push(m), // If there are no documents for this pair, there will be no // results for the phrase query. @@ -223,7 +234,7 @@ pub fn compute_phrase_docids( let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { if let Some(m) = - ctx.get_db_word_pair_proximity_docids(s1, s2, dist as u8 + 1)? + ctx.get_db_word_pair_proximity_docids(universe, s1, s2, dist as u8 + 1)? { bitmap |= m; } From 41f51adbeceeea45ad7e08e06e908e440aa681b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 21 Jun 2024 14:56:47 +0200 Subject: [PATCH 3/9] Do less useless intersections --- milli/src/search/new/exact_attribute.rs | 26 ++++++++++++++----- .../search/new/ranking_rule_graph/fid/mod.rs | 6 ++--- .../new/ranking_rule_graph/position/mod.rs | 14 +++++----- .../proximity/compute_docids.rs | 7 ++--- milli/src/search/new/resolve_query_graph.rs | 1 + 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 52efa63a4..b013adfa0 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,3 +1,4 @@ +use heed::types::Bytes; use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; @@ -5,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput}; use crate::score_details::{self, ScoreDetails}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; -use crate::{Result, SearchContext, SearchLogger}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; /// A ranking rule that produces 3 disjoint buckets: /// @@ -173,8 +174,7 @@ impl State { let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = ctx .get_db_word_position_docids(Some(universe), *word, bucketed_position)? - .unwrap_or_default() - & universe; + .unwrap_or_default(); candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -205,16 +205,24 @@ impl State { .unwrap_or_default()) }), )?; + // TODO Why not doing this intersection in the MultiOps above? intersection &= &candidates; if !intersection.is_empty() { // Although not really worth it in terms of performance, // if would be good to put this in cache for the sake of consistency let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { - ctx.index + let bitmap_bytes = ctx + .index .field_id_word_count_docids - .get(ctx.txn, &(fid, count_all_positions as u8))? - .unwrap_or_default() - & universe + .remap_data_type::() + .get(ctx.txn, &(fid, count_all_positions as u8))?; + + match bitmap_bytes { + Some(bytes) => { + CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)? + } + None => RoaringBitmap::default(), + } } else { RoaringBitmap::default() }; @@ -237,6 +245,8 @@ impl State { let (state, output) = match state { State::Uninitialized => (state, None), State::ExactAttribute(query_graph, candidates_per_attribute) => { + // TODO it can be much faster to do the intersections before the unions... + // or maybe the candidates_per_attribute are not containing anything outside universe let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( |FieldCandidates { start_with_exact, exact_word_count }| { start_with_exact & exact_word_count @@ -255,6 +265,8 @@ impl State { ) } State::AttributeStarts(query_graph, candidates_per_attribute) => { + // TODO it can be much faster to do the intersections before the unions... + // or maybe the candidates_per_attribute are not containing anything outside universe let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( |FieldCandidates { mut start_with_exact, exact_word_count }| { start_with_exact -= exact_word_count; diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index ad4a83dc3..67775ddea 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -29,14 +29,12 @@ impl RankingRuleGraphTrait for FidGraph { let FidCondition { term, .. } = condition; let docids = if let Some(fid) = condition.fid { - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let docids = compute_query_term_subset_docids_within_field_id( + compute_query_term_subset_docids_within_field_id( ctx, Some(universe), &term.term_subset, fid, - )?; - docids & universe + )? } else { RoaringBitmap::new() }; diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index fd61c1dcc..cbfa705a3 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -28,15 +28,15 @@ impl RankingRuleGraphTrait for PositionGraph { ) -> Result { let PositionCondition { term, positions } = condition; let mut docids = RoaringBitmap::new(); + // TODO use MultiOps to do the big union for position in positions { // maybe compute_query_term_subset_docids_within_position should accept a universe as argument - docids |= universe - & compute_query_term_subset_docids_within_position( - ctx, - Some(universe), - &term.term_subset, - *position, - )?; + docids |= compute_query_term_subset_docids_within_position( + ctx, + Some(universe), + &term.term_subset, + *position, + )?; } Ok(ComputedCondition { docids, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 29459cd99..6d3199a8d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -143,7 +143,6 @@ fn compute_prefix_edges( right_prefix, forward_proximity, )? { - let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -153,13 +152,13 @@ fn compute_prefix_edges( // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { + // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff. if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( Some(&universe), right_prefix, left_word, backward_proximity, )? { - let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -185,9 +184,7 @@ fn compute_non_prefix_edges( let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { - // TODO do the intersection in the method, again! - let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?; - universe &= phrase_docids; + universe &= ctx.get_phrase_docids(Some(&universe), phrase)?; if universe.is_empty() { return Ok(()); } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 6cab2f364..24e677477 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -251,6 +251,7 @@ pub fn compute_phrase_docids( // We sort the bitmaps so that we perform the small intersections first, which is faster. bitmaps.sort_unstable_by_key(|a| a.len()); + // TODO use MultiOps intersection which and remove the above sort for bitmap in bitmaps { candidates &= bitmap; From cd7a20fa32f49368ff9d2130171b860bd141841f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 21 Jun 2024 15:03:51 +0200 Subject: [PATCH 4/9] Make it work by avoid storing invalid stuff in the cache --- milli/src/search/new/mod.rs | 4 ++-- milli/src/search/new/ranking_rule_graph/exactness/mod.rs | 2 +- .../new/ranking_rule_graph/proximity/compute_docids.rs | 6 +++--- milli/src/search/new/resolve_query_graph.rs | 6 +++--- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 73aaa330f..5932346a1 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -247,14 +247,14 @@ fn resolve_negative_words( #[tracing::instrument(level = "trace", skip_all, target = "search::query")] fn resolve_negative_phrases( ctx: &mut SearchContext<'_>, - universe: Option<&RoaringBitmap>, + _universe: Option<&RoaringBitmap>, negative_phrases: &[LocatedQueryTerm], ) -> Result { let mut negative_bitmap = RoaringBitmap::new(); for term in negative_phrases { let query_term = ctx.term_interner.get(term.value); if let Some(phrase) = query_term.original_phrase() { - negative_bitmap |= ctx.get_phrase_docids(universe, phrase)?; + negative_bitmap |= ctx.get_phrase_docids(None, phrase)?; } } Ok(negative_bitmap) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index ae3a582bb..31f6315d7 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -29,7 +29,7 @@ fn compute_docids( let candidates = match exact_term { // TODO I move the intersection here - ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(Some(universe), phrase)? & universe, + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(None, phrase)? & universe, ExactTerm::Word(word) => { ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default() } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 6d3199a8d..74bef506a 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -74,7 +74,7 @@ pub fn compute_docids( if right_derivs.len() > 1 { let universe = &universe; if let Some(left_phrase) = left_phrase { - if universe.is_disjoint(ctx.get_phrase_docids(Some(universe), left_phrase)?) { + if universe.is_disjoint(ctx.get_phrase_docids(None, left_phrase)?) { continue; } } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? { @@ -126,7 +126,7 @@ fn compute_prefix_edges( // TODO we can clearly give the universe to this method // Unfortunately, it is deserializing/computing stuff and // keeping the result as a materialized bitmap. - let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?; + let phrase_docids = ctx.get_phrase_docids(None, phrase)?; if !phrase_docids.is_empty() { used_left_phrases.insert(phrase); } @@ -184,7 +184,7 @@ fn compute_non_prefix_edges( let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { - universe &= ctx.get_phrase_docids(Some(&universe), phrase)?; + universe &= ctx.get_phrase_docids(None, phrase)?; if universe.is_empty() { return Ok(()); } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 24e677477..d9fdac86e 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -47,7 +47,7 @@ pub fn compute_query_term_subset_docids( } } for phrase in term.all_phrases(ctx)? { - docids |= ctx.get_phrase_docids(universe, phrase)?; + docids |= ctx.get_phrase_docids(None, phrase)?; } if let Some(prefix) = term.use_prefix_db(ctx) { @@ -80,7 +80,7 @@ pub fn compute_query_term_subset_docids_within_field_id( // guaranteed that all of its words are within a single fid. if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? { - docids |= ctx.get_phrase_docids(Some(&word_fid_docids), phrase)?; + docids |= ctx.get_phrase_docids(None, phrase)? & word_fid_docids; } } } @@ -118,7 +118,7 @@ pub fn compute_query_term_subset_docids_within_position( if let Some(word_position_docids) = ctx.get_db_word_position_docids(universe, *word, position)? { - docids |= ctx.get_phrase_docids(Some(&word_position_docids), phrase)?; + docids |= ctx.get_phrase_docids(None, phrase)? & word_position_docids; } } } From 93ba0510942014c8623a9d52296f84f3aea831c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jul 2024 10:03:00 +0200 Subject: [PATCH 5/9] Remove the invalid get_phrases_docids universe parameter --- .../new/ranking_rule_graph/exactness/mod.rs | 2 +- .../proximity/compute_docids.rs | 6 +++--- milli/src/search/new/resolve_query_graph.rs | 21 +++++++------------ 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs index 31f6315d7..e0c2294e5 100644 --- a/milli/src/search/new/ranking_rule_graph/exactness/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/exactness/mod.rs @@ -29,7 +29,7 @@ fn compute_docids( let candidates = match exact_term { // TODO I move the intersection here - ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(None, phrase)? & universe, + ExactTerm::Phrase(phrase) => ctx.get_phrase_docids(phrase)? & universe, ExactTerm::Word(word) => { ctx.word_docids(Some(universe), Word::Original(word))?.unwrap_or_default() } diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 74bef506a..4ef83b534 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -74,7 +74,7 @@ pub fn compute_docids( if right_derivs.len() > 1 { let universe = &universe; if let Some(left_phrase) = left_phrase { - if universe.is_disjoint(ctx.get_phrase_docids(None, left_phrase)?) { + if universe.is_disjoint(ctx.get_phrase_docids(left_phrase)?) { continue; } } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? { @@ -126,7 +126,7 @@ fn compute_prefix_edges( // TODO we can clearly give the universe to this method // Unfortunately, it is deserializing/computing stuff and // keeping the result as a materialized bitmap. - let phrase_docids = ctx.get_phrase_docids(None, phrase)?; + let phrase_docids = ctx.get_phrase_docids(phrase)?; if !phrase_docids.is_empty() { used_left_phrases.insert(phrase); } @@ -184,7 +184,7 @@ fn compute_non_prefix_edges( let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { - universe &= ctx.get_phrase_docids(None, phrase)?; + universe &= ctx.get_phrase_docids(phrase)?; if universe.is_empty() { return Ok(()); } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index d9fdac86e..7a47b0a66 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -19,15 +19,11 @@ pub struct PhraseDocIdsCache { } impl<'ctx> SearchContext<'ctx> { /// Get the document ids associated with the given phrase - pub fn get_phrase_docids( - &mut self, - universe: Option<&RoaringBitmap>, - phrase: Interned, - ) -> Result<&RoaringBitmap> { + pub fn get_phrase_docids(&mut self, phrase: Interned) -> Result<&RoaringBitmap> { if self.phrase_docids.cache.contains_key(&phrase) { return Ok(&self.phrase_docids.cache[&phrase]); }; - let docids = compute_phrase_docids(self, universe, phrase)?; + let docids = compute_phrase_docids(self, phrase)?; // TODO can we improve that? Because there is an issue, we keep that in cache... let _ = self.phrase_docids.cache.insert(phrase, docids); let docids = &self.phrase_docids.cache[&phrase]; @@ -47,7 +43,7 @@ pub fn compute_query_term_subset_docids( } } for phrase in term.all_phrases(ctx)? { - docids |= ctx.get_phrase_docids(None, phrase)?; + docids |= ctx.get_phrase_docids(phrase)?; } if let Some(prefix) = term.use_prefix_db(ctx) { @@ -80,7 +76,7 @@ pub fn compute_query_term_subset_docids_within_field_id( // guaranteed that all of its words are within a single fid. if let Some(word) = phrase.words(ctx).iter().flatten().next() { if let Some(word_fid_docids) = ctx.get_db_word_fid_docids(universe, *word, fid)? { - docids |= ctx.get_phrase_docids(None, phrase)? & word_fid_docids; + docids |= ctx.get_phrase_docids(phrase)? & word_fid_docids; } } } @@ -118,7 +114,7 @@ pub fn compute_query_term_subset_docids_within_position( if let Some(word_position_docids) = ctx.get_db_word_position_docids(universe, *word, position)? { - docids |= ctx.get_phrase_docids(None, phrase)? & word_position_docids; + docids |= ctx.get_phrase_docids(phrase)? & word_position_docids; } } } @@ -190,7 +186,6 @@ pub fn compute_query_graph_docids( pub fn compute_phrase_docids( ctx: &mut SearchContext<'_>, - universe: Option<&RoaringBitmap>, phrase: Interned, ) -> Result { let Phrase { words } = ctx.phrase_interner.get(phrase).clone(); @@ -200,7 +195,7 @@ pub fn compute_phrase_docids( } let mut candidates = RoaringBitmap::new(); for word in words.iter().flatten().copied() { - if let Some(word_docids) = ctx.word_docids(universe, Word::Original(word))? { + if let Some(word_docids) = ctx.word_docids(None, Word::Original(word))? { candidates |= word_docids; } else { return Ok(RoaringBitmap::new()); @@ -224,7 +219,7 @@ pub fn compute_phrase_docids( .filter_map(|(index, word)| word.as_ref().map(|word| (index, word))) { if dist == 0 { - match ctx.get_db_word_pair_proximity_docids(universe, s1, s2, 1)? { + match ctx.get_db_word_pair_proximity_docids(None, s1, s2, 1)? { Some(m) => bitmaps.push(m), // If there are no documents for this pair, there will be no // results for the phrase query. @@ -234,7 +229,7 @@ pub fn compute_phrase_docids( let mut bitmap = RoaringBitmap::new(); for dist in 0..=dist { if let Some(m) = - ctx.get_db_word_pair_proximity_docids(universe, s1, s2, dist as u8 + 1)? + ctx.get_db_word_pair_proximity_docids(None, s1, s2, dist as u8 + 1)? { bitmap |= m; } From febea735caaec330ecab1c075c536b6c04dc6501 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jul 2024 10:03:20 +0200 Subject: [PATCH 6/9] Remove the unused universe parameter from resolve_negative_phrases --- milli/src/search/new/mod.rs | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 5932346a1..f6a4a802c 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -247,14 +247,13 @@ fn resolve_negative_words( #[tracing::instrument(level = "trace", skip_all, target = "search::query")] fn resolve_negative_phrases( ctx: &mut SearchContext<'_>, - _universe: Option<&RoaringBitmap>, negative_phrases: &[LocatedQueryTerm], ) -> Result { let mut negative_bitmap = RoaringBitmap::new(); for term in negative_phrases { let query_term = ctx.term_interner.get(term.value); if let Some(phrase) = query_term.original_phrase() { - negative_bitmap |= ctx.get_phrase_docids(None, phrase)?; + negative_bitmap |= ctx.get_phrase_docids(phrase)?; } } Ok(negative_bitmap) @@ -689,7 +688,7 @@ pub fn execute_search( used_negative_operator = !negative_words.is_empty() || !negative_phrases.is_empty(); let ignored_documents = resolve_negative_words(ctx, Some(&universe), &negative_words)?; - let ignored_phrases = resolve_negative_phrases(ctx, Some(&universe), &negative_phrases)?; + let ignored_phrases = resolve_negative_phrases(ctx, &negative_phrases)?; universe -= ignored_documents; universe -= ignored_phrases; From 1693d1a311cdfeef8f41c19f00e4620c2378fa91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jul 2024 10:09:39 +0200 Subject: [PATCH 7/9] Simplify the check to decide to stop a loop --- .../search/new/ranking_rule_graph/proximity/compute_docids.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 4ef83b534..68b934e20 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -78,7 +78,7 @@ pub fn compute_docids( continue; } } else if let Some(left_word_docids) = ctx.word_docids(Some(universe), left_word)? { - if universe.is_disjoint(&left_word_docids) { + if left_word_docids.is_empty() { continue; } } From ce61cb7fe61bc2f877ec3c10d666690e626543e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jul 2024 10:11:44 +0200 Subject: [PATCH 8/9] Simplify and speedup an intersection pass --- milli/src/search/new/exact_attribute.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index b013adfa0..269879ad6 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -193,7 +193,7 @@ impl State { let mut candidates_per_attribute = Vec::with_capacity(searchable_fields_ids.len()); // then check that there exists at least one attribute that has all of the terms for fid in searchable_fields_ids { - let mut intersection = MultiOps::intersection( + let intersection = MultiOps::intersection( words_positions .iter() .flat_map(|(words, ..)| words.iter()) @@ -201,12 +201,10 @@ impl State { .flatten() .map(|word| -> Result<_> { Ok(ctx - .get_db_word_fid_docids(Some(universe), *word, fid)? + .get_db_word_fid_docids(Some(&candidates), *word, fid)? .unwrap_or_default()) }), )?; - // TODO Why not doing this intersection in the MultiOps above? - intersection &= &candidates; if !intersection.is_empty() { // Although not really worth it in terms of performance, // if would be good to put this in cache for the sake of consistency From 3bac22fd87b7c5d384eb2395b087c7bc61e8f0fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 10 Jul 2024 10:12:35 +0200 Subject: [PATCH 9/9] We do not do intersections with the universe when it is related to cache --- .../search/new/ranking_rule_graph/proximity/compute_docids.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 68b934e20..39dbef46d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -136,7 +136,6 @@ fn compute_prefix_edges( } } - // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff. if let Some(new_docids) = ctx.get_db_word_prefix_pair_proximity_docids( Some(&universe), left_word, @@ -152,7 +151,6 @@ fn compute_prefix_edges( // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { - // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff. if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( Some(&universe), right_prefix, @@ -190,7 +188,6 @@ fn compute_non_prefix_edges( } } - // TODO check that it is not an issue to alterate the universe if let Some(new_docids) = ctx.get_db_word_pair_proximity_docids(Some(&universe), word1, word2, forward_proximity)? {