From 41f51adbeceeea45ad7e08e06e908e440aa681b9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 21 Jun 2024 14:56:47 +0200 Subject: [PATCH] Do less useless intersections --- milli/src/search/new/exact_attribute.rs | 26 ++++++++++++++----- .../search/new/ranking_rule_graph/fid/mod.rs | 6 ++--- .../new/ranking_rule_graph/position/mod.rs | 14 +++++----- .../proximity/compute_docids.rs | 7 ++--- milli/src/search/new/resolve_query_graph.rs | 1 + 5 files changed, 31 insertions(+), 23 deletions(-) diff --git a/milli/src/search/new/exact_attribute.rs b/milli/src/search/new/exact_attribute.rs index 52efa63a4..b013adfa0 100644 --- a/milli/src/search/new/exact_attribute.rs +++ b/milli/src/search/new/exact_attribute.rs @@ -1,3 +1,4 @@ +use heed::types::Bytes; use roaring::{MultiOps, RoaringBitmap}; use super::query_graph::QueryGraph; @@ -5,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput}; use crate::score_details::{self, ScoreDetails}; use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_term::ExactTerm; -use crate::{Result, SearchContext, SearchLogger}; +use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger}; /// A ranking rule that produces 3 disjoint buckets: /// @@ -173,8 +174,7 @@ impl State { let bucketed_position = crate::bucketed_position(position + offset); let word_position_docids = ctx .get_db_word_position_docids(Some(universe), *word, bucketed_position)? - .unwrap_or_default() - & universe; + .unwrap_or_default(); candidates &= word_position_docids; if candidates.is_empty() { return Ok(State::Empty(query_graph.clone())); @@ -205,16 +205,24 @@ impl State { .unwrap_or_default()) }), )?; + // TODO Why not doing this intersection in the MultiOps above? intersection &= &candidates; if !intersection.is_empty() { // Although not really worth it in terms of performance, // if would be good to put this in cache for the sake of consistency let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { - ctx.index + let bitmap_bytes = ctx + .index .field_id_word_count_docids - .get(ctx.txn, &(fid, count_all_positions as u8))? - .unwrap_or_default() - & universe + .remap_data_type::() + .get(ctx.txn, &(fid, count_all_positions as u8))?; + + match bitmap_bytes { + Some(bytes) => { + CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)? + } + None => RoaringBitmap::default(), + } } else { RoaringBitmap::default() }; @@ -237,6 +245,8 @@ impl State { let (state, output) = match state { State::Uninitialized => (state, None), State::ExactAttribute(query_graph, candidates_per_attribute) => { + // TODO it can be much faster to do the intersections before the unions... + // or maybe the candidates_per_attribute are not containing anything outside universe let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( |FieldCandidates { start_with_exact, exact_word_count }| { start_with_exact & exact_word_count @@ -255,6 +265,8 @@ impl State { ) } State::AttributeStarts(query_graph, candidates_per_attribute) => { + // TODO it can be much faster to do the intersections before the unions... + // or maybe the candidates_per_attribute are not containing anything outside universe let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( |FieldCandidates { mut start_with_exact, exact_word_count }| { start_with_exact -= exact_word_count; diff --git a/milli/src/search/new/ranking_rule_graph/fid/mod.rs b/milli/src/search/new/ranking_rule_graph/fid/mod.rs index ad4a83dc3..67775ddea 100644 --- a/milli/src/search/new/ranking_rule_graph/fid/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/fid/mod.rs @@ -29,14 +29,12 @@ impl RankingRuleGraphTrait for FidGraph { let FidCondition { term, .. } = condition; let docids = if let Some(fid) = condition.fid { - // maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument - let docids = compute_query_term_subset_docids_within_field_id( + compute_query_term_subset_docids_within_field_id( ctx, Some(universe), &term.term_subset, fid, - )?; - docids & universe + )? } else { RoaringBitmap::new() }; diff --git a/milli/src/search/new/ranking_rule_graph/position/mod.rs b/milli/src/search/new/ranking_rule_graph/position/mod.rs index fd61c1dcc..cbfa705a3 100644 --- a/milli/src/search/new/ranking_rule_graph/position/mod.rs +++ b/milli/src/search/new/ranking_rule_graph/position/mod.rs @@ -28,15 +28,15 @@ impl RankingRuleGraphTrait for PositionGraph { ) -> Result { let PositionCondition { term, positions } = condition; let mut docids = RoaringBitmap::new(); + // TODO use MultiOps to do the big union for position in positions { // maybe compute_query_term_subset_docids_within_position should accept a universe as argument - docids |= universe - & compute_query_term_subset_docids_within_position( - ctx, - Some(universe), - &term.term_subset, - *position, - )?; + docids |= compute_query_term_subset_docids_within_position( + ctx, + Some(universe), + &term.term_subset, + *position, + )?; } Ok(ComputedCondition { docids, diff --git a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs index 29459cd99..6d3199a8d 100644 --- a/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs +++ b/milli/src/search/new/ranking_rule_graph/proximity/compute_docids.rs @@ -143,7 +143,6 @@ fn compute_prefix_edges( right_prefix, forward_proximity, )? { - let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -153,13 +152,13 @@ fn compute_prefix_edges( // No swapping when computing the proximity between a phrase and a word if left_phrase.is_none() { + // TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff. if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( Some(&universe), right_prefix, left_word, backward_proximity, )? { - let new_docids = &universe & new_docids; if !new_docids.is_empty() { used_left_words.insert(left_word); used_right_prefix.insert(right_prefix); @@ -185,9 +184,7 @@ fn compute_non_prefix_edges( let mut universe = universe.clone(); for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { - // TODO do the intersection in the method, again! - let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?; - universe &= phrase_docids; + universe &= ctx.get_phrase_docids(Some(&universe), phrase)?; if universe.is_empty() { return Ok(()); } diff --git a/milli/src/search/new/resolve_query_graph.rs b/milli/src/search/new/resolve_query_graph.rs index 6cab2f364..24e677477 100644 --- a/milli/src/search/new/resolve_query_graph.rs +++ b/milli/src/search/new/resolve_query_graph.rs @@ -251,6 +251,7 @@ pub fn compute_phrase_docids( // We sort the bitmaps so that we perform the small intersections first, which is faster. bitmaps.sort_unstable_by_key(|a| a.len()); + // TODO use MultiOps intersection which and remove the above sort for bitmap in bitmaps { candidates &= bitmap;