From 531e3d7d6ad7b09d81561502a0234afd864cdfe9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 4 Jun 2024 15:01:26 -0400 Subject: [PATCH 01/13] MultiOps trait for OR operations --- milli/src/search/facet/filter.rs | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index dbd9538a5..adeec45da 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -4,7 +4,7 @@ use std::ops::Bound::{self, Excluded, Included}; use either::Either; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Token}; -use roaring::RoaringBitmap; +use roaring::{MultiOps, RoaringBitmap}; use serde_json::Value; use super::facet_range_search; @@ -382,14 +382,10 @@ impl<'a> Filter<'a> { }))? } } - FilterCondition::Or(subfilters) => { - let mut bitmap = RoaringBitmap::new(); - for f in subfilters { - bitmap |= - Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)?; - } - Ok(bitmap) - } + FilterCondition::Or(subfilters) => subfilters + .iter() + .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); if let Some(first_subfilter) = subfilters_iter.next() { From ff2e498267c367a17e9a74da5c3e309d61db1bf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:23:26 -0400 Subject: [PATCH 02/13] Patch roaring to use the version supporting intersection on deserialization --- Cargo.lock | 12 ++---------- Cargo.toml | 3 +++ 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b62a61f92..e72d72251 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4377,12 +4377,6 @@ dependencies = [ "winreg", ] -[[package]] -name = "retain_mut" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8c31b5c4033f8fdde8700e4657be2c497e7288f01515be52168c631e2e4d4086" - [[package]] name = "ring" version = "0.17.8" @@ -4400,13 +4394,11 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6106b5cf8587f5834158895e9715a3c6c9716c8aefab57f1f7680917191c7873" +version = "0.10.4" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#4466ae0104ed44a8cf41d187d9359483fe190701" dependencies = [ "bytemuck", "byteorder", - "retain_mut", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 5c6c8b376..f49d1fd44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,3 +64,6 @@ opt-level = 3 opt-level = 3 [profile.bench.package.yada] opt-level = 3 + +[patch.crates-io] +roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "intersection-with-serialized" } From e4a69c5ac320a9f6c9f160c84bcf1346353c1aea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:06:25 -0400 Subject: [PATCH 03/13] Introduce the FacetGroupLazyValue type --- milli/src/heed_codec/facet/mod.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 7bb874060..a8bb5055e 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -47,6 +47,12 @@ pub struct FacetGroupValue { pub bitmap: RoaringBitmap, } +#[derive(Debug)] +pub struct FacetGroupLazyValue<'b> { + pub size: u8, + pub bitmap_bytes: &'b [u8], +} + pub struct FacetGroupKeyCodec { _phantom: PhantomData, } @@ -69,6 +75,7 @@ where Ok(Cow::Owned(v)) } } + impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec where T: BytesDecode<'a>, @@ -84,6 +91,7 @@ where } pub struct FacetGroupValueCodec; + impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { type EItem = FacetGroupValue; @@ -93,11 +101,23 @@ impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec { Ok(Cow::Owned(v)) } } + impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec { type DItem = FacetGroupValue; + fn bytes_decode(bytes: &'a [u8]) -> Result { let size = bytes[0]; let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?; Ok(FacetGroupValue { size, bitmap }) } } + +pub struct FacetGroupLazyValueCodec; + +impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec { + type DItem = FacetGroupLazyValue<'a>; + + fn bytes_decode(bytes: &'a [u8]) -> Result { + Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] }) + } +} From 4ca4a3f954b2a2df17a6aaf0ccf77fafccd65d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:06:57 -0400 Subject: [PATCH 04/13] Make the CboRoaringBitmapCodec support intersection on deserialization --- .../cbo_roaring_bitmap_codec.rs | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs index 1db518c7d..a04698019 100644 --- a/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs +++ b/milli/src/heed_codec/roaring_bitmap/cbo_roaring_bitmap_codec.rs @@ -1,5 +1,5 @@ use std::borrow::Cow; -use std::io; +use std::io::{self, Cursor}; use std::mem::size_of; use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt}; @@ -57,6 +57,24 @@ impl CboRoaringBitmapCodec { } } + pub fn intersection_with_serialized( + mut bytes: &[u8], + other: &RoaringBitmap, + ) -> io::Result { + // See above `deserialize_from` method for implementation details. + if bytes.len() <= THRESHOLD * size_of::() { + let mut bitmap = RoaringBitmap::new(); + while let Ok(integer) = bytes.read_u32::() { + if other.contains(integer) { + bitmap.insert(integer); + } + } + Ok(bitmap) + } else { + other.intersection_with_serialized_unchecked(Cursor::new(bytes)) + } + } + /// Merge serialized CboRoaringBitmaps in a buffer. /// /// if the merged values length is under the threshold, values are directly From a6f3a01c6a3a4d836f31544cc668c363e954e983 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 15:10:00 -0400 Subject: [PATCH 05/13] Expose the universe to do efficient intersections on deserialization --- .../search/facet/facet_distribution_iter.rs | 4 +- milli/src/search/facet/facet_range_search.rs | 59 +++++++++++++++---- .../src/search/facet/facet_sort_ascending.rs | 2 +- .../src/search/facet/facet_sort_descending.rs | 4 +- milli/src/search/facet/filter.rs | 39 +++++++++--- milli/src/search/facet/mod.rs | 14 ++--- 6 files changed, 92 insertions(+), 30 deletions(-) diff --git a/milli/src/search/facet/facet_distribution_iter.rs b/milli/src/search/facet/facet_distribution_iter.rs index d993ef2dc..a8aa1a006 100644 --- a/milli/src/search/facet/facet_distribution_iter.rs +++ b/milli/src/search/facet/facet_distribution_iter.rs @@ -38,7 +38,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { fd.iterate(candidates, highest_level, first_bound, usize::MAX)?; Ok(()) } else { @@ -81,7 +81,7 @@ where field_id, )?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { // We first fill the heap with values from the highest level let starting_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index e340fbac5..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -4,9 +4,11 @@ use heed::BytesEncode; use roaring::RoaringBitmap; use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; -use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec}; +use crate::heed_codec::facet::{ + FacetGroupKey, FacetGroupKeyCodec, FacetGroupLazyValueCodec, FacetGroupValueCodec, +}; use crate::heed_codec::BytesRefCodec; -use crate::Result; +use crate::{CboRoaringBitmapCodec, Result}; /// Find all the document ids for which the given field contains a value contained within /// the two bounds. @@ -16,6 +18,7 @@ pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>( field_id: u16, left: &'t Bound<>::EItem>, right: &'t Bound<>::EItem>, + universe: Option<&RoaringBitmap>, docids: &mut RoaringBitmap, ) -> Result<()> where @@ -46,13 +49,15 @@ where } Bound::Unbounded => Bound::Unbounded, }; - let db = db.remap_key_type::>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let db = db.remap_types::, FacetGroupLazyValueCodec>(); + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(starting_left_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(starting_left_bound) = + get_first_facet_value::(rtxn, db, field_id)? + { let rightmost_bound = - Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded + Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) @@ -64,12 +69,16 @@ where /// Fetch the document ids that have a facet with a value between the two given bounds struct FacetRangeSearch<'t, 'b, 'bitmap> { rtxn: &'t heed::RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, FacetGroupLazyValueCodec>, field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } + impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = @@ -104,7 +113,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(value.bitmap_bytes)?, + }; } } Ok(()) @@ -195,7 +210,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; previous_key = next_key; previous_value = next_value; continue; @@ -291,7 +312,13 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { left_condition && right_condition }; if should_take_whole_group { - *self.docids |= &previous_value.bitmap; + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, + None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, + }; } else { let level = level - 1; let starting_left_bound = previous_key.left_bound; @@ -365,6 +392,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -384,6 +412,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -418,6 +447,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -439,6 +469,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -474,6 +505,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -499,6 +531,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -537,6 +570,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -556,6 +590,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -571,6 +606,7 @@ mod tests { 0, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -586,6 +622,7 @@ mod tests { 1, &Bound::Unbounded, &Bound::Unbounded, + None, &mut docids, ) .unwrap(); @@ -621,6 +658,7 @@ mod tests { 0, &start, &end, + None, &mut docids, ) .unwrap(); @@ -634,6 +672,7 @@ mod tests { 1, &start, &end, + None, &mut docids, ) .unwrap(); diff --git a/milli/src/search/facet/facet_sort_ascending.rs b/milli/src/search/facet/facet_sort_ascending.rs index 07fe64510..59a95e5bd 100644 --- a/milli/src/search/facet/facet_sort_ascending.rs +++ b/milli/src/search/facet/facet_sort_ascending.rs @@ -36,7 +36,7 @@ pub fn ascending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); diff --git a/milli/src/search/facet/facet_sort_descending.rs b/milli/src/search/facet/facet_sort_descending.rs index dd2692012..29586e4e4 100644 --- a/milli/src/search/facet/facet_sort_descending.rs +++ b/milli/src/search/facet/facet_sort_descending.rs @@ -19,9 +19,9 @@ pub fn descending_facet_sort<'t>( candidates: RoaringBitmap, ) -> Result> + 't> { let highest_level = get_highest_level(rtxn, db, field_id)?; - if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { + if let Some(first_bound) = get_first_facet_value::(rtxn, db, field_id)? { let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound }; - let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); + let last_bound = get_last_facet_value::(rtxn, db, field_id)?.unwrap(); let last_key = FacetGroupKey { field_id, level: highest_level, left_bound: last_bound }; let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX); Ok(itertools::Either::Left(DescendingFacetSort { diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index adeec45da..f5fd0f2fd 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -224,14 +224,14 @@ impl<'a> Filter<'a> { pub fn evaluate(&self, rtxn: &heed::RoTxn, index: &Index) -> Result { // to avoid doing this for each recursive call we're going to do it ONCE ahead of time let filterable_fields = index.filterable_fields(rtxn)?; - - self.inner_evaluate(rtxn, index, &filterable_fields) + self.inner_evaluate(rtxn, index, &filterable_fields, None) } fn evaluate_operator( rtxn: &heed::RoTxn, index: &Index, field_id: FieldId, + universe: Option<&RoaringBitmap>, operator: &Condition<'a>, ) -> Result { let numbers_db = index.facet_id_f64_docids; @@ -291,14 +291,22 @@ impl<'a> Filter<'a> { } Condition::NotEqual(val) => { let operator = Condition::Equal(val.clone()); - let docids = Self::evaluate_operator(rtxn, index, field_id, &operator)?; + let docids = Self::evaluate_operator(rtxn, index, field_id, None, &operator)?; let all_ids = index.documents_ids(rtxn)?; return Ok(all_ids - docids); } }; let mut output = RoaringBitmap::new(); - Self::explore_facet_number_levels(rtxn, numbers_db, field_id, left, right, &mut output)?; + Self::explore_facet_number_levels( + rtxn, + numbers_db, + field_id, + left, + right, + universe, + &mut output, + )?; Ok(output) } @@ -310,6 +318,7 @@ impl<'a> Filter<'a> { field_id: FieldId, left: Bound, right: Bound, + universe: Option<&RoaringBitmap>, output: &mut RoaringBitmap, ) -> Result<()> { match (left, right) { @@ -321,7 +330,7 @@ impl<'a> Filter<'a> { (_, _) => (), } facet_range_search::find_docids_of_facet_within_bounds::( - rtxn, db, field_id, &left, &right, output, + rtxn, db, field_id, &left, &right, universe, output, )?; Ok(()) @@ -332,15 +341,18 @@ impl<'a> Filter<'a> { rtxn: &heed::RoTxn, index: &Index, filterable_fields: &HashSet, + universe: Option<&RoaringBitmap>, ) -> Result { match &self.condition { FilterCondition::Not(f) => { + // TODO improve the documents_ids to also support intersections at deserialize time. let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, index, filterable_fields, + universe, )?; Ok(all_ids - selected) } @@ -353,7 +365,8 @@ impl<'a> Filter<'a> { for el in els { let op = Condition::Equal(el.clone()); - let el_bitmap = Self::evaluate_operator(rtxn, index, fid, &op)?; + let el_bitmap = + Self::evaluate_operator(rtxn, index, fid, universe, &op)?; bitmap |= el_bitmap; } Ok(bitmap) @@ -371,7 +384,7 @@ impl<'a> Filter<'a> { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; if let Some(fid) = field_ids_map.id(fid.value()) { - Self::evaluate_operator(rtxn, index, fid, op) + Self::evaluate_operator(rtxn, index, fid, universe, op) } else { Ok(RoaringBitmap::new()) } @@ -384,7 +397,8 @@ impl<'a> Filter<'a> { } FilterCondition::Or(subfilters) => subfilters .iter() - .map(|f| Self::inner_evaluate(&(f.clone()).into(), rtxn, index, filterable_fields)) + .cloned() + .map(|f| Self::inner_evaluate(&f.into(), rtxn, index, filterable_fields, universe)) .union(), FilterCondition::And(subfilters) => { let mut subfilters_iter = subfilters.iter(); @@ -394,16 +408,21 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; for f in subfilters_iter { if bitmap.is_empty() { return Ok(bitmap); } + // TODO We are doing the intersections two times, + // it could be more efficient + // Can't I just replace this `&=` by an `=`? bitmap &= Self::inner_evaluate( &(f.clone()).into(), rtxn, index, filterable_fields, + Some(&bitmap), )?; } Ok(bitmap) @@ -503,6 +522,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let geo_lng_token = Token::new( @@ -535,6 +555,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; let condition_right = FilterCondition::Condition { @@ -548,6 +569,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )?; left | right @@ -563,6 +585,7 @@ impl<'a> Filter<'a> { rtxn, index, filterable_fields, + universe, )? }; diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 34a9cdcb8..858028bb5 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -7,7 +7,7 @@ use roaring::RoaringBitmap; pub use self::facet_distribution::{FacetDistribution, OrderBy, DEFAULT_VALUES_PER_FACET}; pub use self::filter::{BadGeoError, Filter}; pub use self::search::{FacetValueHit, SearchForFacetValues}; -use crate::heed_codec::facet::{FacetGroupKeyCodec, FacetGroupValueCodec, OrderedF64Codec}; +use crate::heed_codec::facet::{FacetGroupKeyCodec, OrderedF64Codec}; use crate::heed_codec::BytesRefCodec; use crate::{Index, Result}; @@ -54,9 +54,9 @@ pub fn facet_max_value<'t>( } /// Get the first facet value in the facet database -pub(crate) fn get_first_facet_value<'t, BoundCodec>( +pub(crate) fn get_first_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -78,9 +78,9 @@ where } /// Get the last facet value in the facet database -pub(crate) fn get_last_facet_value<'t, BoundCodec>( +pub(crate) fn get_last_facet_value<'t, BoundCodec, DC>( txn: &'t RoTxn, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result> where @@ -102,9 +102,9 @@ where } /// Get the height of the highest level in the facet database -pub(crate) fn get_highest_level<'t>( +pub(crate) fn get_highest_level<'t, DC>( txn: &'t RoTxn<'t>, - db: heed::Database, FacetGroupValueCodec>, + db: heed::Database, DC>, field_id: u16, ) -> heed::Result { let field_id_prefix = &field_id.to_be_bytes(); From 7967e93c160e88dab7c2cde2f9c3cfe4352e28c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 16:58:32 -0400 Subject: [PATCH 06/13] Skip evaluating when a universe is empty, nothing can be found --- milli/src/search/facet/filter.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index f5fd0f2fd..d75ed5f22 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -343,6 +343,10 @@ impl<'a> Filter<'a> { filterable_fields: &HashSet, universe: Option<&RoaringBitmap>, ) -> Result { + if universe.map_or(false, |u| u.is_empty()) { + return Ok(RoaringBitmap::new()); + } + match &self.condition { FilterCondition::Not(f) => { // TODO improve the documents_ids to also support intersections at deserialize time. From 0a9bd398c7f52bce0b8dde30b9a8ddd7411dbb20 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 16:59:00 -0400 Subject: [PATCH 07/13] Improve the NOT operator to use the universe when possible --- milli/src/search/facet/filter.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d75ed5f22..4570d4ca4 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -349,8 +349,6 @@ impl<'a> Filter<'a> { match &self.condition { FilterCondition::Not(f) => { - // TODO improve the documents_ids to also support intersections at deserialize time. - let all_ids = index.documents_ids(rtxn)?; let selected = Self::inner_evaluate( &(f.as_ref().clone()).into(), rtxn, @@ -358,7 +356,13 @@ impl<'a> Filter<'a> { filterable_fields, universe, )?; - Ok(all_ids - selected) + match universe { + Some(universe) => Ok(universe - selected), + None => { + let all_ids = index.documents_ids(rtxn)?; + Ok(all_ids - selected) + } + } } FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { From 66470b27e678345b409702267945c9de7893383e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 17:03:57 -0400 Subject: [PATCH 08/13] Use the MultiOps trait for IN operations --- milli/src/search/facet/filter.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index 4570d4ca4..c08abc8e0 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -367,17 +367,11 @@ impl<'a> Filter<'a> { FilterCondition::In { fid, els } => { if crate::is_faceted(fid.value(), filterable_fields) { let field_ids_map = index.fields_ids_map(rtxn)?; - if let Some(fid) = field_ids_map.id(fid.value()) { - let mut bitmap = RoaringBitmap::new(); - - for el in els { - let op = Condition::Equal(el.clone()); - let el_bitmap = - Self::evaluate_operator(rtxn, index, fid, universe, &op)?; - bitmap |= el_bitmap; - } - Ok(bitmap) + els.iter() + .map(|el| Condition::Equal(el.clone())) + .map(|op| Self::evaluate_operator(rtxn, index, fid, universe, &op)) + .union() } else { Ok(RoaringBitmap::new()) } From 5432776132cf5b23032e265bc76b2bf2a04c0c2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 21:30:04 -0400 Subject: [PATCH 09/13] Reduce the universe while exploring the facet tree --- milli/src/search/facet/facet_range_search.rs | 59 +++++++++++++------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 0f8f58771..81fa0ef42 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -50,7 +50,7 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_types::, FacetGroupLazyValueCodec>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(starting_left_bound) = @@ -59,7 +59,7 @@ where let rightmost_bound = Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; - f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size, universe)?; Ok(()) } else { Ok(()) @@ -73,14 +73,18 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, - /// The subset of documents ids that are useful for this search. - /// Great performance optimizations can be achieved by only fetching values matching this subset. - universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { - fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { + fn run_level_0( + &mut self, + starting_left_bound: &'t [u8], + group_size: usize, + // The subset of documents ids that are useful for this search. + // Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&RoaringBitmap>, + ) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); @@ -113,7 +117,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= match self.universe { + *self.docids |= match universe { Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( value.bitmap_bytes, universe, @@ -150,9 +154,10 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { starting_left_bound: &'t [u8], rightmost_bound: Bound<&'t [u8]>, group_size: usize, + universe: Option<&RoaringBitmap>, ) -> Result<()> { if level == 0 { - return self.run_level_0(starting_left_bound, group_size); + return self.run_level_0(starting_left_bound, group_size, universe); } let left_key = @@ -209,12 +214,16 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; + let subset = match universe { + Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?), + None => None, + }; if should_take_whole_group { - *self.docids |= match self.universe { - Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?, + *self.docids |= match subset { + Some(subset) => subset, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; previous_key = next_key; @@ -229,7 +238,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let rightmost_bound = Bound::Excluded(next_key.left_bound); let group_size = previous_value.size as usize; - self.run(level, starting_left_bound, rightmost_bound, group_size)?; + if subset.as_ref().map_or(true, |u| !u.is_empty()) { + self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; + } previous_key = next_key; previous_value = next_value; @@ -311,12 +322,18 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; + + let subset = match universe { + Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?), + None => None, + }; + if should_take_whole_group { - *self.docids |= match self.universe { - Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?, + *self.docids |= match subset { + Some(subset) => subset, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; } else { @@ -324,7 +341,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let starting_left_bound = previous_key.left_bound; let group_size = previous_value.size as usize; - self.run(level, starting_left_bound, rightmost_bound, group_size)?; + if subset.as_ref().map_or(true, |u| !u.is_empty()) { + self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; + } } Ok(()) From 52d0d35b39096eb49e67a9801b1edc44993fa2b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 5 Jun 2024 21:58:40 -0400 Subject: [PATCH 10/13] Revert "Reduce the universe while exploring the facet tree" because it's slower this way This reverts commit 14026115f21409535772ede0ee4273f37848dd61. --- milli/src/search/facet/facet_range_search.rs | 59 +++++++------------- 1 file changed, 20 insertions(+), 39 deletions(-) diff --git a/milli/src/search/facet/facet_range_search.rs b/milli/src/search/facet/facet_range_search.rs index 81fa0ef42..0f8f58771 100644 --- a/milli/src/search/facet/facet_range_search.rs +++ b/milli/src/search/facet/facet_range_search.rs @@ -50,7 +50,7 @@ where Bound::Unbounded => Bound::Unbounded, }; let db = db.remap_types::, FacetGroupLazyValueCodec>(); - let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids }; + let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, universe, docids }; let highest_level = get_highest_level(rtxn, db, field_id)?; if let Some(starting_left_bound) = @@ -59,7 +59,7 @@ where let rightmost_bound = Bound::Included(get_last_facet_value::(rtxn, db, field_id)?.unwrap()); // will not fail because get_first_facet_value succeeded let group_size = usize::MAX; - f.run(highest_level, starting_left_bound, rightmost_bound, group_size, universe)?; + f.run(highest_level, starting_left_bound, rightmost_bound, group_size)?; Ok(()) } else { Ok(()) @@ -73,18 +73,14 @@ struct FacetRangeSearch<'t, 'b, 'bitmap> { field_id: u16, left: Bound<&'b [u8]>, right: Bound<&'b [u8]>, + /// The subset of documents ids that are useful for this search. + /// Great performance optimizations can be achieved by only fetching values matching this subset. + universe: Option<&'bitmap RoaringBitmap>, docids: &'bitmap mut RoaringBitmap, } impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { - fn run_level_0( - &mut self, - starting_left_bound: &'t [u8], - group_size: usize, - // The subset of documents ids that are useful for this search. - // Great performance optimizations can be achieved by only fetching values matching this subset. - universe: Option<&RoaringBitmap>, - ) -> Result<()> { + fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> { let left_key = FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound }; let iter = self.db.range(self.rtxn, &(left_key..))?.take(group_size); @@ -117,7 +113,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { } if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) { - *self.docids |= match universe { + *self.docids |= match self.universe { Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( value.bitmap_bytes, universe, @@ -154,10 +150,9 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { starting_left_bound: &'t [u8], rightmost_bound: Bound<&'t [u8]>, group_size: usize, - universe: Option<&RoaringBitmap>, ) -> Result<()> { if level == 0 { - return self.run_level_0(starting_left_bound, group_size, universe); + return self.run_level_0(starting_left_bound, group_size); } let left_key = @@ -214,16 +209,12 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; - let subset = match universe { - Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?), - None => None, - }; if should_take_whole_group { - *self.docids |= match subset { - Some(subset) => subset, + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; previous_key = next_key; @@ -238,9 +229,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let rightmost_bound = Bound::Excluded(next_key.left_bound); let group_size = previous_value.size as usize; - if subset.as_ref().map_or(true, |u| !u.is_empty()) { - self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; - } + self.run(level, starting_left_bound, rightmost_bound, group_size)?; previous_key = next_key; previous_value = next_value; @@ -322,18 +311,12 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { }; left_condition && right_condition }; - - let subset = match universe { - Some(universe) => Some(CboRoaringBitmapCodec::intersection_with_serialized( - previous_value.bitmap_bytes, - universe, - )?), - None => None, - }; - if should_take_whole_group { - *self.docids |= match subset { - Some(subset) => subset, + *self.docids |= match self.universe { + Some(universe) => CboRoaringBitmapCodec::intersection_with_serialized( + previous_value.bitmap_bytes, + universe, + )?, None => CboRoaringBitmapCodec::deserialize_from(previous_value.bitmap_bytes)?, }; } else { @@ -341,9 +324,7 @@ impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> { let starting_left_bound = previous_key.left_bound; let group_size = previous_value.size as usize; - if subset.as_ref().map_or(true, |u| !u.is_empty()) { - self.run(level, starting_left_bound, rightmost_bound, group_size, subset.as_ref())?; - } + self.run(level, starting_left_bound, rightmost_bound, group_size)?; } Ok(()) From 40f05fe15693cb8cfea35af7de887447406b25cd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 10:59:55 -0400 Subject: [PATCH 11/13] Bump roaring to the latest commit --- Cargo.lock | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.lock b/Cargo.lock index e72d72251..88af93bd4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4395,7 +4395,7 @@ dependencies = [ [[package]] name = "roaring" version = "0.10.4" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#4466ae0104ed44a8cf41d187d9359483fe190701" +source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec" dependencies = [ "bytemuck", "byteorder", From 75b2e02cd2e3b46d0f0494c6da822940c9fe98cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 6 Jun 2024 11:00:07 -0400 Subject: [PATCH 12/13] Log more stuff around filtering --- milli/src/search/new/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/search/new/mod.rs b/milli/src/search/new/mod.rs index 623c72567..52eb7ffea 100644 --- a/milli/src/search/new/mod.rs +++ b/milli/src/search/new/mod.rs @@ -548,6 +548,7 @@ fn resolve_sort_criteria<'ctx, Query: RankingRuleQueryTrait>( Ok(()) } +#[tracing::instrument(level = "trace", skip_all, target = "search")] pub fn filtered_universe( index: &Index, txn: &RoTxn<'_>, From 8ec6e175e52c1050566ea4460793f073508b83f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Fri, 7 Jun 2024 22:11:26 -0400 Subject: [PATCH 13/13] Replace roaring patch to the v0.10.5 --- Cargo.lock | 5 +++-- Cargo.toml | 3 --- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 88af93bd4..4417af63a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4394,8 +4394,9 @@ dependencies = [ [[package]] name = "roaring" -version = "0.10.4" -source = "git+https://github.com/RoaringBitmap/roaring-rs?branch=intersection-with-serialized#88b848b84cf7c8cc8d2ea02dfff77b5a54d822ec" +version = "0.10.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7699249cc2c7d71939f30868f47e9d7add0bdc030d90ee10bfd16887ff8bb1c8" dependencies = [ "bytemuck", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index f49d1fd44..5c6c8b376 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -64,6 +64,3 @@ opt-level = 3 opt-level = 3 [profile.bench.package.yada] opt-level = 3 - -[patch.crates-io] -roaring = { git = "https://github.com/RoaringBitmap/roaring-rs", branch = "intersection-with-serialized" }