Improve facet distribution speed in lexico mode

This commit is contained in:
Clément Renault 2024-06-20 12:57:08 +02:00
parent e580d6b98f
commit 19d7cdc20d
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F

View File

@ -1,14 +1,17 @@
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::io::Cursor;
use std::ops::ControlFlow;
use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::DocumentId;
use crate::{CboRoaringBitmapCodec, DocumentId};
/// Call the given closure on the facet distribution of the candidate documents.
///
@ -31,12 +34,9 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
@ -146,7 +146,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16,
callback: CB,
}
@ -171,7 +171,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@ -205,7 +208,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let cf = self.iterate(
&docids_in_common,