4713: Speed up facet distribution r=ManyTheFish a=Kerollmops

This PR is akin to #4682, but this time, the same logic is applied to the facets. Bitmaps are not decoded, and we do an intersection on the bytes with the search candidates instead of materializing the RoaringBitmap to destroy it just after the operation.

A prospect raised some slow requests when performing facet searches, and I found out that the disk optimization intersection wasn't performed on the facets.

Co-authored-by: Clément Renault <clement@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-06-24 05:23:46 +00:00 committed by GitHub
commit ddd564665b
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 19 deletions

View File

@ -6,9 +6,11 @@ use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec;
use crate::DocumentId;
use crate::{CboRoaringBitmapCodec, DocumentId};
/// Call the given closure on the facet distribution of the candidate documents.
///
@ -31,12 +33,9 @@ pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let mut fd = LexicographicFacetDistribution { rtxn, db, field_id, callback };
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
@ -75,11 +74,8 @@ where
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(),
field_id,
)?;
let db = db.remap_data_type::<FacetGroupLazyValueCodec>();
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
@ -92,7 +88,10 @@ where
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@ -121,7 +120,10 @@ where
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let intersection = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
let count = intersection.len();
if count != 0 {
heap.push(LevelEntry {
@ -146,7 +148,7 @@ where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16,
callback: CB,
}
@ -171,7 +173,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let any_docid_in_common = docids_in_common.min().unwrap();
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
@ -205,7 +210,10 @@ where
if key.field_id != self.field_id {
return Ok(ControlFlow::Break(()));
}
let docids_in_common = value.bitmap & candidates;
let docids_in_common = CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
candidates,
)?;
if !docids_in_common.is_empty() {
let cf = self.iterate(
&docids_in_common,

View File

@ -290,7 +290,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
regenerate_if_prompt_changed(
obkv,
(old_prompt, prompt),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
)?
} else {
// we can simply ignore user provided vectors as they are not regenerated and are
@ -306,7 +306,7 @@ pub fn extract_vector_points<R: io::Read + io::Seek>(
prompt,
(add_to_user_provided, remove_from_user_provided),
(old, new),
(&old_fields_ids_map, &new_fields_ids_map),
(old_fields_ids_map, new_fields_ids_map),
document_id,
)?,
};