From a6f3a01c6a3a4d836f31544cc668c363e954e983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:10:00 -0400 Subject: [PATCH] Expose the universe to do efficient intersections on deserialization --- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/facet_range_search.rs | 59 +++++++++++++++---- .../src/search/facet/facet_sort_ascending.rs | 2 +- .../src/search/facet/facet_sort_descending.rs | 4 +- milli/src/search/facet/filter.rs | 39 +++++++++--- milli/src/search/facet/mod.rs | 14 ++--- 6 files changed, 92 insertions(+), 30 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d993ef2dc..a8aa1a006 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -38,7 +38,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -81,7 +81,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e340fbac5..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,11 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::Result; +use crate::{CboRoaringBitmapCodec, Result}; /// Find all the document ids for which the given field contains a value contained within /// the two bounds. @@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, + universe: Option<&RoaringBitmap>, docids: &mut RoaringBitmap, ) -> Result<()> where @@ -46,13 +49,15 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let db = db.remap_types::, FacetGroupLazyValueCodec>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? + { let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -64,12 +69,16 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } + impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = @@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?, + }; } } Ok(()) @@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; previous_key = next_key; previous_value = next_value; continue; @@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; } else { let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -365,6 +392,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -384,6 +412,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -418,6 +447,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -439,6 +469,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -474,6 +505,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -499,6 +531,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -537,6 +570,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -556,6 +590,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -571,6 +606,7 @@ mod tests { 0, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -586,6 +622,7 @@ mod tests { 1, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -621,6 +658,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -634,6 +672,7 @@ mod tests { 1, &start, &end, + None, &mut docids, ) .unwrap(); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 07fe64510..59a95e5bd 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index dd2692012..29586e4e4 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index adeec45da..f5fd0f2fd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -224,14 +224,14 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let filterable_fields = index.filterable_fields(rtxn)?; - - self.inner_evaluate(rtxn, index, &filterable_fields) + self.inner_evaluate(rtxn, index, &filterable_fields, None) } fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, field_id: FieldId, + universe: Option<&RoaringBitmap>, operator: &Condition<'a>, ) -> Result { let numbers_db = index.facet_id_f64_docids; @@ -291,14 +291,22 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } }; let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + left, + right, + universe, + &mut output, + )?; Ok(output) } @@ -310,6 +318,7 @@ impl<'a> Filter<'a> { field_id: FieldId, left: Bound, right: Bound, + universe: Option<&RoaringBitmap>, output: &mut RoaringBitmap, ) -> Result<()> { match (left, right) { @@ -321,7 +330,7 @@ impl<'a> Filter<'a> { (_, _) => (), } facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, output, + rtxn, db, field_id, &left, &right, universe, output, )?; Ok(()) @@ -332,15 +341,18 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, index: &Index, filterable_fields: &HashSet, + universe: Option<&RoaringBitmap>, ) -> Result { match &self.condition { FilterCondition::Not(f) => { + // TODO improve the documents_ids to also support intersections at deserialize time. let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, index, filterable_fields, + universe, )?; Ok(all_ids - selected) } @@ -353,7 +365,8 @@ impl<'a> Filter<'a> { for el in els { let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; + let el_bitmap = + Self::evaluate_operator(rtxn, index, fid, universe, &op)?; bitmap |= el_bitmap; } Ok(bitmap) @@ -371,7 +384,7 @@ impl<'a> Filter<'a> { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, op) + Self::evaluate_operator(rtxn, index, fid, universe, op) } else { Ok(RoaringBitmap::new()) } @@ -384,7 +397,8 @@ impl<'a> Filter<'a> { } FilterCondition::Or(subfilters) => subfilters .iter() - .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .cloned() + .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); @@ -394,16 +408,21 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } + // TODO We are doing the intersections two times, + // it could be more efficient + // Can't I just replace this `&=` by an `=`? bitmap &= Self::inner_evaluate( &(f.clone()).into(), rtxn, index, filterable_fields, + Some(&bitmap), )?; } Ok(bitmap) @@ -503,6 +522,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let geo_lng_token = Token::new( @@ -535,6 +555,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let condition_right = FilterCondition::Condition { @@ -548,6 +569,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; left | right @@ -563,6 +585,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )? }; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 34a9cdcb8..858028bb5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; pub use self::search::{FacetValueHit, SearchForFacetValues}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; @@ -54,9 +54,9 @@ pub fn facet_max_value<'t>( } /// Get the first facet value in the facet database -pub(crate) fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -78,9 +78,9 @@ where } /// Get the last facet value in the facet database -pub(crate) fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -102,9 +102,9 @@ where } /// Get the height of the highest level in the facet database -pub(crate) fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t, DC>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes();