Do less useless intersections

This commit is contained in:
Clément Renault 2024-06-21 14:56:47 +02:00
parent 0ca1a4e805
commit 41f51adbec
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
5 changed files with 31 additions and 23 deletions

View File

@ -1,3 +1,4 @@
use heed::types::Bytes;
use roaring::{MultiOps, RoaringBitmap}; use roaring::{MultiOps, RoaringBitmap};
use super::query_graph::QueryGraph; use super::query_graph::QueryGraph;
@ -5,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::score_details::{self, ScoreDetails}; use crate::score_details::{self, ScoreDetails};
use crate::search::new::query_graph::QueryNodeData; use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm; use crate::search::new::query_term::ExactTerm;
use crate::{Result, SearchContext, SearchLogger}; use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
/// A ranking rule that produces 3 disjoint buckets: /// A ranking rule that produces 3 disjoint buckets:
/// ///
@ -173,8 +174,7 @@ impl State {
let bucketed_position = crate::bucketed_position(position + offset); let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = ctx let word_position_docids = ctx
.get_db_word_position_docids(Some(universe), *word, bucketed_position)? .get_db_word_position_docids(Some(universe), *word, bucketed_position)?
.unwrap_or_default() .unwrap_or_default();
& universe;
candidates &= word_position_docids; candidates &= word_position_docids;
if candidates.is_empty() { if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone())); return Ok(State::Empty(query_graph.clone()));
@ -205,16 +205,24 @@ impl State {
.unwrap_or_default()) .unwrap_or_default())
}), }),
)?; )?;
// TODO Why not doing this intersection in the MultiOps above?
intersection &= &candidates; intersection &= &candidates;
if !intersection.is_empty() { if !intersection.is_empty() {
// Although not really worth it in terms of performance, // Although not really worth it in terms of performance,
// if would be good to put this in cache for the sake of consistency // if would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize { let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index let bitmap_bytes = ctx
.index
.field_id_word_count_docids .field_id_word_count_docids
.get(ctx.txn, &(fid, count_all_positions as u8))? .remap_data_type::<Bytes>()
.unwrap_or_default() .get(ctx.txn, &(fid, count_all_positions as u8))?;
& universe
match bitmap_bytes {
Some(bytes) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
}
None => RoaringBitmap::default(),
}
} else { } else {
RoaringBitmap::default() RoaringBitmap::default()
}; };
@ -237,6 +245,8 @@ impl State {
let (state, output) = match state { let (state, output) = match state {
State::Uninitialized => (state, None), State::Uninitialized => (state, None),
State::ExactAttribute(query_graph, candidates_per_attribute) => { State::ExactAttribute(query_graph, candidates_per_attribute) => {
// TODO it can be much faster to do the intersections before the unions...
// or maybe the candidates_per_attribute are not containing anything outside universe
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map( let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|FieldCandidates { start_with_exact, exact_word_count }| { |FieldCandidates { start_with_exact, exact_word_count }| {
start_with_exact & exact_word_count start_with_exact & exact_word_count
@ -255,6 +265,8 @@ impl State {
) )
} }
State::AttributeStarts(query_graph, candidates_per_attribute) => { State::AttributeStarts(query_graph, candidates_per_attribute) => {
// TODO it can be much faster to do the intersections before the unions...
// or maybe the candidates_per_attribute are not containing anything outside universe
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map( let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|FieldCandidates { mut start_with_exact, exact_word_count }| { |FieldCandidates { mut start_with_exact, exact_word_count }| {
start_with_exact -= exact_word_count; start_with_exact -= exact_word_count;

View File

@ -29,14 +29,12 @@ impl RankingRuleGraphTrait for FidGraph {
let FidCondition { term, .. } = condition; let FidCondition { term, .. } = condition;
let docids = if let Some(fid) = condition.fid { let docids = if let Some(fid) = condition.fid {
// maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument compute_query_term_subset_docids_within_field_id(
let docids = compute_query_term_subset_docids_within_field_id(
ctx, ctx,
Some(universe), Some(universe),
&term.term_subset, &term.term_subset,
fid, fid,
)?; )?
docids & universe
} else { } else {
RoaringBitmap::new() RoaringBitmap::new()
}; };

View File

@ -28,10 +28,10 @@ impl RankingRuleGraphTrait for PositionGraph {
) -> Result<ComputedCondition> { ) -> Result<ComputedCondition> {
let PositionCondition { term, positions } = condition; let PositionCondition { term, positions } = condition;
let mut docids = RoaringBitmap::new(); let mut docids = RoaringBitmap::new();
// TODO use MultiOps to do the big union
for position in positions { for position in positions {
// maybe compute_query_term_subset_docids_within_position should accept a universe as argument // maybe compute_query_term_subset_docids_within_position should accept a universe as argument
docids |= universe docids |= compute_query_term_subset_docids_within_position(
& compute_query_term_subset_docids_within_position(
ctx, ctx,
Some(universe), Some(universe),
&term.term_subset, &term.term_subset,

View File

@ -143,7 +143,6 @@ fn compute_prefix_edges(
right_prefix, right_prefix,
forward_proximity, forward_proximity,
)? { )? {
let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(left_word); used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix); used_right_prefix.insert(right_prefix);
@ -153,13 +152,13 @@ fn compute_prefix_edges(
// No swapping when computing the proximity between a phrase and a word // No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() { if left_phrase.is_none() {
// TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids( if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
Some(&universe), Some(&universe),
right_prefix, right_prefix,
left_word, left_word,
backward_proximity, backward_proximity,
)? { )? {
let new_docids = &universe & new_docids;
if !new_docids.is_empty() { if !new_docids.is_empty() {
used_left_words.insert(left_word); used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix); used_right_prefix.insert(right_prefix);
@ -185,9 +184,7 @@ fn compute_non_prefix_edges(
let mut universe = universe.clone(); let mut universe = universe.clone();
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() { for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
// TODO do the intersection in the method, again! universe &= ctx.get_phrase_docids(Some(&universe), phrase)?;
let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
universe &= phrase_docids;
if universe.is_empty() { if universe.is_empty() {
return Ok(()); return Ok(());
} }

View File

@ -251,6 +251,7 @@ pub fn compute_phrase_docids(
// We sort the bitmaps so that we perform the small intersections first, which is faster. // We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by_key(|a| a.len()); bitmaps.sort_unstable_by_key(|a| a.len());
// TODO use MultiOps intersection which and remove the above sort
for bitmap in bitmaps { for bitmap in bitmaps {
candidates &= bitmap; candidates &= bitmap;