Expose the universe to do efficient intersections on deserialization

This commit is contained in:
Clément Renault 2024-06-05 15:10:00 -04:00
parent 4ca4a3f954
commit a6f3a01c6a
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
6 changed files with 92 additions and 30 deletions

View File

@ -38,7 +38,7 @@ where
field_id, field_id,
)?; )?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
Ok(()) Ok(())
} else { } else {
@ -81,7 +81,7 @@ where
field_id, field_id,
)?; )?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level // We first fill the heap with values from the highest level
let starting_key = let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };

View File

@ -4,9 +4,11 @@ use heed::BytesEncode;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; use crate::heed_codec::facet::{
FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec,
};
use crate::heed_codec::BytesRefCodec; use crate::heed_codec::BytesRefCodec;
use crate::Result; use crate::{CboRoaringBitmapCodec, Result};
/// Find all the document ids for which the given field contains a value contained within /// Find all the document ids for which the given field contains a value contained within
/// the two bounds. /// the two bounds.
@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
field_id: u16, field_id: u16,
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>, right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
universe: Option<&RoaringBitmap>,
docids: &mut RoaringBitmap, docids: &mut RoaringBitmap,
) -> Result<()> ) -> Result<()>
where where
@ -46,13 +49,15 @@ where
} }
Bound::Unbounded => Bound::Unbounded, Bound::Unbounded => Bound::Unbounded,
}; };
let db = db.remap_key_type::<FacetGroupKeyCodec<BytesRefCodec>>(); let db = db.remap_types::<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids };
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(starting_left_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { if let Some(starting_left_bound) =
get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?
{
let rightmost_bound = let rightmost_bound =
Bound::Included(get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded Bound::Included(get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded
let group_size = usize::MAX; let group_size = usize::MAX;
f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?;
Ok(()) Ok(())
@ -64,12 +69,16 @@ where
/// Fetch the document ids that have a facet with a value between the two given bounds /// Fetch the document ids that have a facet with a value between the two given bounds
struct FacetRangeSearch<'t, 'b, 'bitmap> { struct FacetRangeSearch<'t, 'b, 'bitmap> {
rtxn: &'t heed::RoTxn<'t>, rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupLazyValueCodec>,
field_id: u16, field_id: u16,
left: Bound<&'b [u8]>, left: Bound<&'b [u8]>,
right: Bound<&'b [u8]>, right: Bound<&'b [u8]>,
/// The subset of documents ids that are useful for this search.
/// Great performance optimizations can be achieved by only fetching values matching this subset.
universe: Option<&'bitmap RoaringBitmap>,
docids: &'bitmap mut RoaringBitmap, docids: &'bitmap mut RoaringBitmap,
} }
impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
let left_key = let left_key =
@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
} }
if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
*self.docids |= value.bitmap; *self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?,
};
} }
} }
Ok(()) Ok(())
@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
left_condition && right_condition left_condition && right_condition
}; };
if should_take_whole_group { if should_take_whole_group {
*self.docids |= &previous_value.bitmap; *self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
previous_key = next_key; previous_key = next_key;
previous_value = next_value; previous_value = next_value;
continue; continue;
@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
left_condition && right_condition left_condition && right_condition
}; };
if should_take_whole_group { if should_take_whole_group {
*self.docids |= &previous_value.bitmap; *self.docids |= match self.universe {
Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized(
previous_value.bitmap_bytes,
universe,
)?,
None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?,
};
} else { } else {
let level = level - 1; let level = level - 1;
let starting_left_bound = previous_key.left_bound; let starting_left_bound = previous_key.left_bound;
@ -365,6 +392,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -384,6 +412,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -418,6 +447,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -439,6 +469,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -474,6 +505,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -499,6 +531,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -537,6 +570,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -556,6 +590,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -571,6 +606,7 @@ mod tests {
0, 0,
&Bound::Unbounded, &Bound::Unbounded,
&Bound::Unbounded, &Bound::Unbounded,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -586,6 +622,7 @@ mod tests {
1, 1,
&Bound::Unbounded, &Bound::Unbounded,
&Bound::Unbounded, &Bound::Unbounded,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -621,6 +658,7 @@ mod tests {
0, 0,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();
@ -634,6 +672,7 @@ mod tests {
1, 1,
&start, &start,
&end, &end,
None,
&mut docids, &mut docids,
) )
.unwrap(); .unwrap();

View File

@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>(
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);

View File

@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>(
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> { ) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
let highest_level = get_highest_level(rtxn, db, field_id)?; let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<BytesRefCodec>(rtxn, db, field_id)? { if let Some(first_bound) = get_first_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)? {
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
let last_bound = get_last_facet_value::<BytesRefCodec>(rtxn, db, field_id)?.unwrap(); let last_bound = get_last_facet_value::<BytesRefCodec, _>(rtxn, db, field_id)?.unwrap();
let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound };
let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
Ok(itertools::Either::Left(DescendingFacetSort { Ok(itertools::Either::Left(DescendingFacetSort {

View File

@ -224,14 +224,14 @@ impl<'a> Filter<'a> {
pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result<RoaringBitmap> {
// to avoid doing this for each recursive call we're going to do it ONCE ahead of time // to avoid doing this for each recursive call we're going to do it ONCE ahead of time
let filterable_fields = index.filterable_fields(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?;
self.inner_evaluate(rtxn, index, &filterable_fields, None)
self.inner_evaluate(rtxn, index, &filterable_fields)
} }
fn evaluate_operator( fn evaluate_operator(
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
field_id: FieldId, field_id: FieldId,
universe: Option<&RoaringBitmap>,
operator: &Condition<'a>, operator: &Condition<'a>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
let numbers_db = index.facet_id_f64_docids; let numbers_db = index.facet_id_f64_docids;
@ -291,14 +291,22 @@ impl<'a> Filter<'a> {
} }
Condition::NotEqual(val) => { Condition::NotEqual(val) => {
let operator = Condition::Equal(val.clone()); let operator = Condition::Equal(val.clone());
let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?;
let all_ids = index.documents_ids(rtxn)?; let all_ids = index.documents_ids(rtxn)?;
return Ok(all_ids - docids); return Ok(all_ids - docids);
} }
}; };
let mut output = RoaringBitmap::new(); let mut output = RoaringBitmap::new();
Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; Self::explore_facet_number_levels(
rtxn,
numbers_db,
field_id,
left,
right,
universe,
&mut output,
)?;
Ok(output) Ok(output)
} }
@ -310,6 +318,7 @@ impl<'a> Filter<'a> {
field_id: FieldId, field_id: FieldId,
left: Bound<f64>, left: Bound<f64>,
right: Bound<f64>, right: Bound<f64>,
universe: Option<&RoaringBitmap>,
output: &mut RoaringBitmap, output: &mut RoaringBitmap,
) -> Result<()> { ) -> Result<()> {
match (left, right) { match (left, right) {
@ -321,7 +330,7 @@ impl<'a> Filter<'a> {
(_, _) => (), (_, _) => (),
} }
facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>( facet_range_search::find_docids_of_facet_within_bounds::<OrderedF64Codec>(
rtxn, db, field_id, &left, &right, output, rtxn, db, field_id, &left, &right, universe, output,
)?; )?;
Ok(()) Ok(())
@ -332,15 +341,18 @@ impl<'a> Filter<'a> {
rtxn: &heed::RoTxn, rtxn: &heed::RoTxn,
index: &Index, index: &Index,
filterable_fields: &HashSet<String>, filterable_fields: &HashSet<String>,
universe: Option<&RoaringBitmap>,
) -> Result<RoaringBitmap> { ) -> Result<RoaringBitmap> {
match &self.condition { match &self.condition {
FilterCondition::Not(f) => { FilterCondition::Not(f) => {
// TODO improve the documents_ids to also support intersections at deserialize time.
let all_ids = index.documents_ids(rtxn)?; let all_ids = index.documents_ids(rtxn)?;
let selected = Self::inner_evaluate( let selected = Self::inner_evaluate(
&(f.as_ref().clone()).into(), &(f.as_ref().clone()).into(),
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
Ok(all_ids - selected) Ok(all_ids - selected)
} }
@ -353,7 +365,8 @@ impl<'a> Filter<'a> {
for el in els { for el in els {
let op = Condition::Equal(el.clone()); let op = Condition::Equal(el.clone());
let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; let el_bitmap =
Self::evaluate_operator(rtxn, index, fid, universe, &op)?;
bitmap |= el_bitmap; bitmap |= el_bitmap;
} }
Ok(bitmap) Ok(bitmap)
@ -371,7 +384,7 @@ impl<'a> Filter<'a> {
if crate::is_faceted(fid.value(), filterable_fields) { if crate::is_faceted(fid.value(), filterable_fields) {
let field_ids_map = index.fields_ids_map(rtxn)?; let field_ids_map = index.fields_ids_map(rtxn)?;
if let Some(fid) = field_ids_map.id(fid.value()) { if let Some(fid) = field_ids_map.id(fid.value()) {
Self::evaluate_operator(rtxn, index, fid, op) Self::evaluate_operator(rtxn, index, fid, universe, op)
} else { } else {
Ok(RoaringBitmap::new()) Ok(RoaringBitmap::new())
} }
@ -384,7 +397,8 @@ impl<'a> Filter<'a> {
} }
FilterCondition::Or(subfilters) => subfilters FilterCondition::Or(subfilters) => subfilters
.iter() .iter()
.map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) .cloned()
.map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe))
.union(), .union(),
FilterCondition::And(subfilters) => { FilterCondition::And(subfilters) => {
let mut subfilters_iter = subfilters.iter(); let mut subfilters_iter = subfilters.iter();
@ -394,16 +408,21 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
for f in subfilters_iter { for f in subfilters_iter {
if bitmap.is_empty() { if bitmap.is_empty() {
return Ok(bitmap); return Ok(bitmap);
} }
// TODO We are doing the intersections two times,
// it could be more efficient
// Can't I just replace this `&=` by an `=`?
bitmap &= Self::inner_evaluate( bitmap &= Self::inner_evaluate(
&(f.clone()).into(), &(f.clone()).into(),
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
Some(&bitmap),
)?; )?;
} }
Ok(bitmap) Ok(bitmap)
@ -503,6 +522,7 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
let geo_lng_token = Token::new( let geo_lng_token = Token::new(
@ -535,6 +555,7 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
let condition_right = FilterCondition::Condition { let condition_right = FilterCondition::Condition {
@ -548,6 +569,7 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)?; )?;
left | right left | right
@ -563,6 +585,7 @@ impl<'a> Filter<'a> {
rtxn, rtxn,
index, index,
filterable_fields, filterable_fields,
universe,
)? )?
}; };

View File

@ -7,7 +7,7 @@ use roaring::RoaringBitmap;
pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET};
pub use self::filter::{BadGeoError, Filter}; pub use self::filter::{BadGeoError, Filter};
pub use self::search::{FacetValueHit, SearchForFacetValues}; pub use self::search::{FacetValueHit, SearchForFacetValues};
use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec};
use crate::heed_codec::BytesRefCodec; use crate::heed_codec::BytesRefCodec;
use crate::{Index, Result}; use crate::{Index, Result};
@ -54,9 +54,9 @@ pub fn facet_max_value<'t>(
} }
/// Get the first facet value in the facet database /// Get the first facet value in the facet database
pub(crate) fn get_first_facet_value<'t, BoundCodec>( pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16, field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>> ) -> heed::Result<Option<BoundCodec::DItem>>
where where
@ -78,9 +78,9 @@ where
} }
/// Get the last facet value in the facet database /// Get the last facet value in the facet database
pub(crate) fn get_last_facet_value<'t, BoundCodec>( pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16, field_id: u16,
) -> heed::Result<Option<BoundCodec::DItem>> ) -> heed::Result<Option<BoundCodec::DItem>>
where where
@ -102,9 +102,9 @@ where
} }
/// Get the height of the highest level in the facet database /// Get the height of the highest level in the facet database
pub(crate) fn get_highest_level<'t>( pub(crate) fn get_highest_level<'t, DC>(
txn: &'t RoTxn<'t>, txn: &'t RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, FacetGroupValueCodec>, db: heed::Database<FacetGroupKeyCodec<BytesRefCodec>, DC>,
field_id: u16, field_id: u16,
) -> heed::Result<u8> { ) -> heed::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes(); let field_id_prefix = &field_id.to_be_bytes();