Do less useless intersections

This commit is contained in:
Clément Renault 2024-06-21 14:56:47 +02:00
parent 5c25f08b91
commit 20862eceb3
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
5 changed files with 31 additions and 23 deletions

View File

@ -1,3 +1,4 @@
use heed::types::Bytes;
use roaring::{MultiOps, RoaringBitmap};
use super::query_graph::QueryGraph;
@ -5,7 +6,7 @@ use super::ranking_rules::{RankingRule, RankingRuleOutput};
use crate::score_details::{self, ScoreDetails};
use crate::search::new::query_graph::QueryNodeData;
use crate::search::new::query_term::ExactTerm;
use crate::{Result, SearchContext, SearchLogger};
use crate::{CboRoaringBitmapCodec, Result, SearchContext, SearchLogger};
/// A ranking rule that produces 3 disjoint buckets:
///
@ -170,8 +171,7 @@ impl State {
let bucketed_position = crate::bucketed_position(position + offset);
let word_position_docids = ctx
.get_db_word_position_docids(Some(universe), *word, bucketed_position)?
.unwrap_or_default()
& universe;
.unwrap_or_default();
candidates &= word_position_docids;
if candidates.is_empty() {
return Ok(State::Empty(query_graph.clone()));
@ -202,16 +202,24 @@ impl State {
.unwrap_or_default())
}),
)?;
// TODO Why not do this intersection in the MultiOps above?
intersection &= &candidates;
if !intersection.is_empty() {
// Although not really worth it in terms of performance,
// it would be good to put this in cache for the sake of consistency
let candidates_with_exact_word_count = if count_all_positions < u8::MAX as usize {
ctx.index
let bitmap_bytes = ctx
.index
.field_id_word_count_docids
.get(ctx.txn, &(fid, count_all_positions as u8))?
.unwrap_or_default()
& universe
.remap_data_type::<Bytes>()
.get(ctx.txn, &(fid, count_all_positions as u8))?;
match bitmap_bytes {
Some(bytes) => {
CboRoaringBitmapCodec::intersection_with_serialized(bytes, universe)?
}
None => RoaringBitmap::default(),
}
} else {
RoaringBitmap::default()
};
@ -234,6 +242,8 @@ impl State {
let (state, output) = match state {
State::Uninitialized => (state, None),
State::ExactAttribute(query_graph, candidates_per_attribute) => {
// TODO it can be much faster to do the intersections before the unions...
// or maybe the candidates_per_attribute do not contain anything outside the universe
let mut candidates = MultiOps::union(candidates_per_attribute.iter().map(
|FieldCandidates { start_with_exact, exact_word_count }| {
start_with_exact & exact_word_count
@ -252,6 +262,8 @@ impl State {
)
}
State::AttributeStarts(query_graph, candidates_per_attribute) => {
// TODO it can be much faster to do the intersections before the unions...
// or maybe the candidates_per_attribute do not contain anything outside the universe
let mut candidates = MultiOps::union(candidates_per_attribute.into_iter().map(
|FieldCandidates { mut start_with_exact, exact_word_count }| {
start_with_exact -= exact_word_count;

View File

@ -28,14 +28,12 @@ impl RankingRuleGraphTrait for FidGraph {
let FidCondition { term, .. } = condition;
let docids = if let Some(fid) = condition.fid {
// maybe compute_query_term_subset_docids_within_field_id should accept a universe as argument
let docids = compute_query_term_subset_docids_within_field_id(
compute_query_term_subset_docids_within_field_id(
ctx,
Some(universe),
&term.term_subset,
fid,
)?;
docids & universe
)?
} else {
RoaringBitmap::new()
};

View File

@ -27,15 +27,15 @@ impl RankingRuleGraphTrait for PositionGraph {
) -> Result<ComputedCondition> {
let PositionCondition { term, positions } = condition;
let mut docids = RoaringBitmap::new();
// TODO use MultiOps to do the big union
for position in positions {
// maybe compute_query_term_subset_docids_within_position should accept a universe as argument
docids |= universe
& compute_query_term_subset_docids_within_position(
ctx,
Some(universe),
&term.term_subset,
*position,
)?;
docids |= compute_query_term_subset_docids_within_position(
ctx,
Some(universe),
&term.term_subset,
*position,
)?;
}
Ok(ComputedCondition {
docids,

View File

@ -143,7 +143,6 @@ fn compute_prefix_edges(
right_prefix,
forward_proximity,
)? {
let new_docids = &universe & new_docids;
if !new_docids.is_empty() {
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
@ -153,13 +152,13 @@ fn compute_prefix_edges(
// No swapping when computing the proximity between a phrase and a word
if left_phrase.is_none() {
// TODO check that the fact that the universe always changes is not an issue, e.g. caching stuff.
if let Some(new_docids) = ctx.get_db_prefix_word_pair_proximity_docids(
Some(&universe),
right_prefix,
left_word,
backward_proximity,
)? {
let new_docids = &universe & new_docids;
if !new_docids.is_empty() {
used_left_words.insert(left_word);
used_right_prefix.insert(right_prefix);
@ -185,9 +184,7 @@ fn compute_non_prefix_edges(
let mut universe = universe.clone();
for phrase in left_phrase.iter().chain(right_phrase.iter()).copied() {
// TODO do the intersection in the method, again!
let phrase_docids = ctx.get_phrase_docids(Some(&universe), phrase)?;
universe &= phrase_docids;
universe &= ctx.get_phrase_docids(Some(&universe), phrase)?;
if universe.is_empty() {
return Ok(());
}

View File

@ -251,6 +251,7 @@ pub fn compute_phrase_docids(
// We sort the bitmaps so that we perform the small intersections first, which is faster.
bitmaps.sort_unstable_by_key(|a| a.len());
// TODO use MultiOps intersection and remove the above sort
for bitmap in bitmaps {
candidates &= bitmap;