Start porting facet distribution and sort to new database structure

This commit is contained in:
Loïc Lecrenier 2022-08-30 14:17:40 +02:00 committed by Loïc Lecrenier
parent 7913d6365c
commit 63ef0aba18
7 changed files with 594 additions and 990 deletions

View File

@ -0,0 +1,199 @@
use roaring::RoaringBitmap;
use std::ops::ControlFlow;
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
use super::{get_first_facet_value, get_highest_level};
/// Walks the facet levels of `field_id` and reports, through `callback`, each level-0
/// facet value whose documents intersect `candidates`.
///
/// `callback` receives the raw bytes of the facet value (the `left_bound` of the level-0
/// key) and the number of matching documents. Returning `ControlFlow::Break(())` from
/// `callback` stops the traversal.
///
/// The traversal starts at the level returned by `get_highest_level` and only descends
/// into groups whose bitmap shares at least one document with `candidates`. Nothing is
/// visited when `get_first_facet_value` returns `None`.
///
/// # Panics
///
/// Panics if a database range cannot be opened or an entry cannot be decoded, because
/// the `heed` results are unwrapped during the traversal.
pub fn iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    callback: CB,
) where
    CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
{
    let mut fd = FacetDistribution { rtxn, db, field_id, callback };
    // `db` already has the key codec expected by the helpers: no remapping is needed.
    let highest_level = get_highest_level(rtxn, db, field_id);
    if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) {
        // Whether the traversal ran to completion or was interrupted by the callback
        // makes no difference to the caller, so the outcome is deliberately discarded.
        let _ = fd.iterate(candidates, highest_level, first_bound, usize::MAX);
    }
}
struct FacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
{
rtxn: &'t heed::RoTxn<'t>,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16,
callback: CB,
}
impl<'t, CB> FacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
{
    /// Visits at most `group_size` level-0 entries, starting at `starting_bound`.
    ///
    /// The callback is invoked for every entry whose bitmap shares at least one document
    /// with `candidates`, with the entry's left bound and the size of the intersection.
    ///
    /// Returns `ControlFlow::Break(())` when the callback asks to stop or when a key of
    /// another field id is reached, `ControlFlow::Continue(())` otherwise.
    fn iterate_level_0(
        &mut self,
        candidates: &RoaringBitmap,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> ControlFlow<()> {
        let starting_key =
            FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size);
        for el in iter {
            let (key, value) = el.unwrap();
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return ControlFlow::Break(());
            }
            let docids_in_common = value.bitmap.intersection_len(candidates);
            if docids_in_common > 0 {
                // `?` forwards a `Break` coming from the callback to our caller.
                (self.callback)(key.left_bound, docids_in_common)?;
            }
        }
        ControlFlow::Continue(())
    }

    /// Visits at most `group_size` groups of `level`, starting at `starting_bound`, and
    /// recurses one level down into every group that intersects `candidates`.
    ///
    /// The recursion only receives the documents shared by the group and `candidates`,
    /// and is limited to the number of entries recorded in the group (`value.size`).
    /// Level 0 is delegated to `iterate_level_0`.
    ///
    /// Returns `ControlFlow::Break(())` when the traversal must stop (callback request or
    /// key of another field id reached), `ControlFlow::Continue(())` otherwise.
    fn iterate(
        &mut self,
        candidates: &RoaringBitmap,
        level: u8,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> ControlFlow<()> {
        if level == 0 {
            return self.iterate_level_0(candidates, starting_bound, group_size);
        }
        let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size);
        for el in iter {
            let (key, value) = el.unwrap();
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return ControlFlow::Break(());
            }
            let docids_in_common = value.bitmap & candidates;
            if !docids_in_common.is_empty() {
                // `?` forwards a `Break` coming from the lower levels to our caller.
                self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize)?;
            }
        }
        ControlFlow::Continue(())
    }
}
#[cfg(test)]
mod tests {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::iterate_over_facet_distribution;
    use crate::{codec::U16Codec, Index};

    /// Builds an index in which every key of `0..256` is associated with the single
    /// document id equal to the key.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        for value in 0..256u16 {
            let mut docids = RoaringBitmap::new();
            docids.insert(value as u32);
            index.insert(&mut wtxn, 0, &value, &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Builds an index from 128 keys drawn below 256 by a generator seeded with 0; each
    /// key is associated with the document ids `key` and `key + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        let rng = fastrand::Rng::with_seed(0);
        for _ in 0..128 {
            let key = rng.u32(..256);
            let mut docids = RoaringBitmap::new();
            docids.insert(key);
            docids.insert(key + 100);
            index.insert(&mut wtxn, 0, &(key as u16), &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Snapshots the content of the random-looking index itself.
    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    /// Snapshots the full distribution of both indexes for the candidates `0..=255`.
    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();
            let record = |facet, count| {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                results.push_str(&format!("{facet}: {count}\n"));
                ControlFlow::Continue(())
            };
            iterate_over_facet_distribution(&rtxn, &index.db.content, 0, &candidates, record);
            insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results);
            rtxn.commit().unwrap();
        }
    }

    /// Same as `filter_distribution_all`, but the callback interrupts the traversal once
    /// 100 facet values have been recorded.
    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();
            let mut nbr_facets = 0;
            let record = |facet, count| {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                if nbr_facets == 100 {
                    return ControlFlow::Break(());
                }
                nbr_facets += 1;
                results.push_str(&format!("{facet}: {count}\n"));
                ControlFlow::Continue(())
            };
            iterate_over_facet_distribution(&rtxn, &index.db.content, 0, &candidates, record);
            insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results);
            rtxn.commit().unwrap();
        }
    }
}

View File

@ -1,335 +0,0 @@
// use std::ops::Bound::{self, Excluded, Included, Unbounded};
// use either::Either::{self, Left, Right};
// use heed::types::{ByteSlice, DecodeIgnore};
// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange};
// use obkv::Key;
// use roaring::RoaringBitmap;
// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec};
// use crate::heed_codec::CboRoaringBitmapCodec;
// use crate::{FieldId, Index};
// pub struct FacetNumberRange<'t, 'e> {
// rtxn: &'t heed::RoTxn<'e>,
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
// iter: RoRange<'t, FacetKeyCodec<OrderedF64Codec>, LazyDecode<FacetGroupValueCodec>>,
// max_bound: f64,
// previous: Option<(FacetKey<f64>, Lazy<'t, FacetGroupValueCodec>)>,
// field_id: FieldId,
// end: Bound<f64>,
// }
// impl<'t, 'e> FacetNumberRange<'t, 'e> {
// pub fn new(
// rtxn: &'t heed::RoTxn<'e>,
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
// field_id: FieldId,
// level: u8,
// left: Bound<f64>,
// right: Bound<f64>,
// ) -> heed::Result<FacetNumberRange<'t, 'e>> {
// let left_bound = match left {
// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }),
// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }),
// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }),
// };
// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?;
// let mut previous = iter.next().transpose()?;
// // Compute the maximum end bound by looking at the key of the last element in level 0
// let mut prefix_level_0 = vec![];
// prefix_level_0.extend_from_slice(&field_id.to_be_bytes());
// prefix_level_0.push(level);
// let mut rev_iter =
// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?;
// let rev_iter_first = rev_iter.next().transpose()?;
// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first {
// let max_bound_key =
// FacetKeyCodec::<OrderedF64Codec>::bytes_decode(max_bound_key).unwrap();
// max_bound_key.left_bound
// } else {
// // I can't imagine when that would happen, but let's handle it correctly anyway
// // by making the iterator empty
// previous = None;
// 0.0 // doesn't matter since previous = None so the iterator will always early exit
// // and return None itself
// };
// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right })
// }
// }
// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> {
// type Item = heed::Result<(FacetKey<f64>, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// // The idea here is to return the **previous** element only if the left
// // bound of the current key fits within the range given to the iter
// // if it doesn't, then there is still a chance that it must be returned,
// // but we need to check the actual right bound of the group by looking for
// // the key preceding the first key of the next group in level 0
// let (prev_key, prev_value) = self.previous?;
// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() {
// let (key, group_value) = match next {
// Ok(n) => n,
// Err(e) => return Some(Err(e)),
// };
// (key.left_bound, Some((key, group_value)))
// } else {
// // we're at the end of the level iter, so we need to fetch the max bound instead
// (self.max_bound, None)
// };
// let must_be_returned = match self.end {
// Included(end) => next_left_bound <= end,
// Excluded(end) => next_left_bound < end,
// Unbounded => true,
// };
// if must_be_returned {
// match prev_value.decode() {
// Ok(group_value) => {
// self.previous = next_previous;
// Some(Ok((prev_key, group_value.bitmap)))
// }
// Err(e) => Some(Err(e)),
// }
// } else {
// // it still possible that we want to return the value (one last time)
// // but to do so, we need to fetch the right bound of the current group
// // this is done by getting the first element at level 0 of the next group
// // then iterating in reverse from it
// // once we have the right bound, we can compare it, and then return or not
// // then we still set self.previous to None so that no other element can return
// // from it?
// let mut level_0_key_prefix = vec![];
// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes());
// level_0_key_prefix.push(0);
// let key =
// FacetKey::<f64> { field_id: self.field_id, level: 0, left_bound: next_left_bound };
// let key_bytes = FacetKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
// level_0_key_prefix.extend_from_slice(&key_bytes);
// let mut rev_iter_next_group_level_0 = self
// .db
// .as_polymorph()
// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix)
// .unwrap();
// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap();
// let key_for_right_bound =
// FacetKeyCodec::<OrderedF64Codec>::bytes_decode(key_for_right_bound).unwrap();
// let right_bound = key_for_right_bound.left_bound;
// let must_be_returned = match self.end {
// Included(end) => right_bound <= end,
// Excluded(end) => right_bound < end,
// Unbounded => unreachable!(),
// };
// self.previous = None;
// if must_be_returned {
// match prev_value.decode() {
// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))),
// Err(e) => Some(Err(e)),
// }
// } else {
// None
// }
// }
// }
// }
// pub struct FacetNumberRevRange<'t> {
// iter: RoRevRange<'t, FacetKeyCodec<OrderedF64Codec>, LazyDecode<FacetGroupValueCodec>>,
// end: Bound<f64>,
// }
// impl<'t> FacetNumberRevRange<'t> {
// pub fn new(
// rtxn: &'t heed::RoTxn,
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
// field_id: FieldId,
// level: u8,
// left: Bound<f64>,
// right: Bound<f64>,
// ) -> heed::Result<FacetNumberRevRange<'t>> {
// let left_bound = match left {
// Included(left) => Included(FacetKey { field_id, level, left_bound: left }),
// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }),
// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }),
// };
// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX });
// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
// Ok(FacetNumberRevRange { iter, end: right })
// }
// }
// impl<'t> Iterator for FacetNumberRevRange<'t> {
// type Item = heed::Result<(FacetKey<f64>, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// loop {
// match self.iter.next() {
// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => {
// let must_be_returned = match self.end {
// Included(end) => todo!(), //right <= end,
// Excluded(end) => todo!(), //right < end,
// Unbounded => true,
// };
// if must_be_returned {
// match docids.decode() {
// Ok(docids) => {
// return Some(Ok((
// FacetKey { field_id, level, left_bound },
// docids.bitmap,
// )))
// }
// Err(e) => return Some(Err(e)),
// }
// }
// continue;
// }
// Some(Err(e)) => return Some(Err(e)),
// None => return None,
// }
// }
// }
// }
// pub struct FacetNumberIter<'t, 'e> {
// rtxn: &'t heed::RoTxn<'t>,
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
// field_id: FieldId,
// level_iters: Vec<(RoaringBitmap, Either<FacetNumberRange<'t, 'e>, FacetNumberRevRange<'t>>)>,
// must_reduce: bool,
// }
// impl<'t, 'e> FacetNumberIter<'t, 'e> {
// /// Create a `FacetNumberIter` that will iterate on the different facet entries
// /// (facet value + documents ids) and that will reduce the given documents ids
// /// while iterating on the different facet levels.
// pub fn new_reducing(
// rtxn: &'t heed::RoTxn<'e>,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
// let db = index.facet_id_f64_docids;
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
// let highest_iter =
// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
// let level_iters = vec![(documents_ids, Left(highest_iter))];
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
// }
// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse
// /// (facet value + documents ids) and that will reduce the given documents ids
// /// while iterating on the different facet levels.
// pub fn new_reverse_reducing(
// rtxn: &'t heed::RoTxn<'e>,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
// let db = index.facet_id_f64_docids;
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
// let highest_iter =
// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
// let level_iters = vec![(documents_ids, Right(highest_iter))];
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
// }
// /// Create a `FacetNumberIter` that will iterate on the different facet entries
// /// (facet value + documents ids) and that will not reduce the given documents ids
// /// while iterating on the different facet levels, possibly returning multiple times
// /// a document id associated with multiple facet values.
// pub fn new_non_reducing(
// rtxn: &'t heed::RoTxn<'e>,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
// let db = index.facet_id_f64_docids;
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
// let highest_iter =
// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
// let level_iters = vec![(documents_ids, Left(highest_iter))];
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false })
// }
// fn highest_level<X>(
// rtxn: &'t heed::RoTxn,
// db: Database<FacetKeyCodec<OrderedF64Codec>, X>,
// fid: FieldId,
// ) -> heed::Result<Option<u8>> {
// let level = db
// .remap_types::<ByteSlice, DecodeIgnore>()
// .prefix_iter(rtxn, &fid.to_be_bytes())?
// .remap_key_type::<FacetKeyCodec<OrderedF64Codec>>()
// .last()
// .transpose()?
// .map(|(key, _)| key.level);
// Ok(level)
// }
// }
// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> {
// type Item = heed::Result<(f64, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// 'outer: loop {
// let (documents_ids, last) = self.level_iters.last_mut()?;
// let is_ascending = last.is_left();
// for result in last {
// // If the last iterator must find an empty set of documents it means
// // that we found all the documents in the sub level iterations already,
// // we can pop this level iterator.
// if documents_ids.is_empty() {
// break;
// }
// match result {
// Ok((key, mut docids)) => {
// docids &= &*documents_ids;
// if !docids.is_empty() {
// if self.must_reduce {
// *documents_ids -= &docids;
// }
// if level == 0 {
// return Some(Ok((left, docids)));
// }
// let rtxn = self.rtxn;
// let db = self.db;
// let fid = self.field_id;
// let left = Included(left);
// let right = Included(right);
// let result = if is_ascending {
// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right)
// .map(Left)
// } else {
// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right)
// .map(Right)
// };
// match result {
// Ok(iter) => {
// self.level_iters.push((docids, iter));
// continue 'outer;
// }
// Err(e) => return Some(Err(e)),
// }
// }
// }
// Err(e) => return Some(Err(e)),
// }
// }
// self.level_iters.pop();
// }
// }
// }

View File

@ -0,0 +1,147 @@
use roaring::RoaringBitmap;
use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
};
use super::{get_first_facet_value, get_highest_level};
/// Returns an iterator over the level-0 facet values of `field_id`, in the order the
/// database ranges yield them, each paired with the documents of `candidates` that were
/// matched by that value.
///
/// A document is removed from the remaining candidates as soon as it has been matched,
/// so each candidate is yielded at most once. The iterator is empty when
/// `get_first_facet_value` returns `None`.
///
/// # Panics
///
/// Panics, here or while iterating, if a database range cannot be opened or an entry
/// cannot be decoded, because the `heed` results are unwrapped.
pub fn ascending_facet_sort<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> {
    // `db` already has the key codec expected by the helpers: no remapping is needed.
    let highest_level = get_highest_level(rtxn, db, field_id);
    if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) {
        let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
        // `take(usize::MAX)` does not limit anything: it only gives the top-level range the
        // same `Take<_>` type as the ranges pushed on the stack for the lower levels.
        let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
        Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })
    } else {
        Box::new(std::iter::empty())
    }
}
/// Iterator state behind `ascending_facet_sort`.
///
/// `stack` holds one entry per level currently being explored: `next` always works on
/// the last entry, pushes a new one when it descends into a group and pops the last one
/// once its range is exhausted.
struct AscendingFacetSort<'t, 'e> {
/// Read transaction used to open the range of every level that is descended into.
rtxn: &'t heed::RoTxn<'e>,
/// Database holding the facet groups.
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
/// Field whose facet values are iterated over.
field_id: u16,
/// One `(remaining documents, range)` pair per level being explored.
stack: Vec<(
// Documents that have not been matched yet by a group read from the range below.
RoaringBitmap,
// Range over the entries of one level, capped with `take`.
std::iter::Take<heed::RoRange<'t, FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>>,
)>,
}
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
    type Item = (&'t [u8], RoaringBitmap);

    /// Yields the next level-0 facet value together with the not-yet-returned documents
    /// it matches, or `None` once the stack is empty or a key of another field id is
    /// reached.
    ///
    /// # Panics
    ///
    /// Panics if an entry cannot be decoded or a range cannot be opened.
    fn next(&mut self) -> Option<Self::Item> {
        'descend: loop {
            // Always resume from the deepest level that is still being explored.
            let (remaining, level_iter) = self.stack.last_mut()?;
            for entry in level_iter {
                let (key, group) = entry.unwrap();
                // The top-level range has no right bound and is not limited by `take`,
                // so it eventually runs into the keys of the next field id.
                if key.field_id != self.field_id {
                    return None;
                }
                // Every document of this level was already returned through the levels
                // below: the level can be abandoned.
                if remaining.is_empty() {
                    break;
                }
                let mut matched = group.bitmap;
                matched &= &*remaining;
                if matched.is_empty() {
                    continue;
                }
                // Matched documents must not be considered again by the next groups.
                *remaining -= &matched;
                if key.level == 0 {
                    return Some((key.left_bound, matched));
                }
                // Explore the children of this group: they start at the same left bound,
                // one level below, and there are `group.size` of them.
                let child_start = FacetKey {
                    field_id: self.field_id,
                    level: key.level - 1,
                    left_bound: key.left_bound,
                };
                let child_iter =
                    self.db.range(self.rtxn, &(child_start..)).unwrap().take(group.size as usize);
                self.stack.push((matched, child_iter));
                continue 'descend;
            }
            self.stack.pop();
        }
    }
}
#[cfg(test)]
mod tests {
    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use crate::{
        ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index,
    };

    /// Builds an index in which every key of `0..256` is associated with the single
    /// document id equal to the key.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        for value in 0..256u16 {
            let mut docids = RoaringBitmap::new();
            docids.insert(value as u32);
            index.insert(&mut wtxn, 0, &value, &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Builds an index from 128 keys drawn below 256 by a generator seeded with 0; each
    /// key is associated with the document ids `key` and `key + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        let rng = fastrand::Rng::with_seed(0);
        for _ in 0..128 {
            let key = rng.u32(..256);
            let mut docids = RoaringBitmap::new();
            docids.insert(key);
            docids.insert(key + 100);
            index.insert(&mut wtxn, 0, &(key as u16), &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Snapshots the content of the random-looking index itself.
    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    /// Snapshots the ascending sort of both indexes for the candidates `200..=300`.
    #[test]
    fn filter_sort() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (200..=300).collect();
            let mut results = String::new();
            for (facet, docids) in ascending_facet_sort(&rtxn, &index.db.content, 0, candidates) {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results);
            rtxn.commit().unwrap();
        }
    }
}

View File

@ -0,0 +1,172 @@
use std::ops::Bound;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
};
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
/// Returns an iterator over the level-0 facet values of `field_id`, read through
/// reversed database ranges, each paired with the documents of `candidates` that were
/// matched by that value.
///
/// A document is removed from the remaining candidates as soon as it has been matched,
/// so each candidate is yielded at most once. The iterator is empty when
/// `get_first_facet_value` returns `None`.
///
/// Exposed with the same visibility as `ascending_facet_sort`, its ascending counterpart.
///
/// # Panics
///
/// Panics, here or while iterating, if a database range cannot be opened or an entry
/// cannot be decoded, and if `get_last_facet_value` returns `None` although a first
/// value exists.
pub fn descending_facet_sort<'t>(
    rtxn: &'t heed::RoTxn<'t>,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: RoaringBitmap,
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> {
    let highest_level = get_highest_level(rtxn, db, field_id);
    if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) {
        let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
        let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id).unwrap();
        let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound };
        // `take(usize::MAX)` does not limit anything: it only gives the top-level range the
        // same `Take<_>` type as the ranges pushed on the stack for the lower levels.
        let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX);
        Box::new(DescendingFacetSort {
            rtxn,
            db,
            field_id,
            stack: vec![(candidates, iter, Bound::Included(last_bound))],
        })
    } else {
        Box::new(std::iter::empty())
    }
}
/// Iterator state behind `descending_facet_sort`.
///
/// `stack` holds one entry per level currently being explored: `next` always works on
/// the last entry, pushes a new one when it descends into a group and pops the last one
/// once its reversed range is exhausted.
struct DescendingFacetSort<'t> {
/// Read transaction used to open the range of every level that is descended into.
rtxn: &'t heed::RoTxn<'t>,
/// Database holding the facet groups.
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
/// Field whose facet values are iterated over.
field_id: u16,
/// One `(remaining documents, reversed range, right bound)` triple per level being explored.
stack: Vec<(
// Documents that have not been matched yet by a group read from the range below.
RoaringBitmap,
// Reversed range over the entries of one level, capped with `take`.
std::iter::Take<heed::RoRevRange<'t, FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>>,
// Right bound used to build the upper end of the range opened one level below;
// it is replaced by the excluded left bound of each group once that group is handled.
Bound<&'t [u8]>,
)>,
}
impl<'t> Iterator for DescendingFacetSort<'t> {
    type Item = (&'t [u8], RoaringBitmap);

    /// Yields the next level-0 facet value read from the reversed ranges, together with
    /// the not-yet-returned documents it matches, or `None` once the stack is empty.
    ///
    /// # Panics
    ///
    /// Panics if an entry cannot be decoded or a range cannot be opened.
    fn next(&mut self) -> Option<Self::Item> {
        'outer: loop {
            // Always resume from the deepest level that is still being explored.
            let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?;
            for result in deepest_iter {
                let (
                    FacetKey { level, left_bound, field_id },
                    FacetGroupValue { size: group_size, mut bitmap },
                ) = result.unwrap();
                // The ranges opened by this iterator start at a key of `self.field_id`, so
                // this looks like a defensive check only: stop if another field id shows up.
                if field_id != self.field_id {
                    return None;
                }
                // If the last iterator found an empty set of documents it means
                // that we found all the documents in the sub level iterations already,
                // we can pop this level iterator.
                if documents_ids.is_empty() {
                    break;
                }
                bitmap &= &*documents_ids;
                if !bitmap.is_empty() {
                    // Matched documents must not be considered again by the next groups.
                    *documents_ids -= &bitmap;
                    if level == 0 {
                        return Some((left_bound, bitmap));
                    }
                    // The children of this group live one level below, between the left
                    // bound of the group and the current right bound of this level.
                    let starting_key_below = FacetKey { field_id, level: level - 1, left_bound };
                    let end_key_below = match *right_bound {
                        Bound::Included(right) => Bound::Included(FacetKey {
                            field_id,
                            level: level - 1,
                            left_bound: right,
                        }),
                        Bound::Excluded(right) => Bound::Excluded(FacetKey {
                            field_id,
                            level: level - 1,
                            left_bound: right,
                        }),
                        Bound::Unbounded => Bound::Unbounded,
                    };
                    // The child level inherits the current right bound, while the next
                    // group of this level must end right before the current group.
                    let prev_right_bound = *right_bound;
                    *right_bound = Bound::Excluded(left_bound);
                    let iter = self
                        .db
                        .rev_range(
                            self.rtxn,
                            &(Bound::Included(starting_key_below), end_key_below),
                        )
                        .unwrap()
                        .take(group_size as usize);
                    self.stack.push((bitmap, iter, prev_right_bound));
                    continue 'outer;
                }
                // Nothing matched in this group: the next group ends right before it.
                *right_bound = Bound::Excluded(left_bound);
            }
            self.stack.pop();
        }
    }
}
#[cfg(test)]
mod tests {
    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use crate::{
        codec::{MyByteSlice, U16Codec},
        descending_facet_sort::descending_facet_sort,
        display_bitmap, FacetKeyCodec, Index,
    };

    /// Builds an index in which every key of `0..256` is associated with the single
    /// document id equal to the key.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        for value in 0..256u16 {
            let mut docids = RoaringBitmap::new();
            docids.insert(value as u32);
            index.insert(&mut wtxn, 0, &value, &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Builds an index from 128 keys drawn below 256 by a generator seeded with 0; each
    /// key is associated with the document ids `key` and `key + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        let rng = fastrand::Rng::with_seed(0);
        for _ in 0..128 {
            let key = rng.u32(..256);
            let mut docids = RoaringBitmap::new();
            docids.insert(key);
            docids.insert(key + 100);
            index.insert(&mut wtxn, 0, &(key as u16), &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Snapshots the content of the random-looking index itself.
    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    /// Snapshots the descending sort of both indexes for the candidates `200..=300`.
    #[test]
    fn filter_sort_descending() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (200..=300).collect();
            let mut results = String::new();
            // The sort works on raw byte keys, so the key codec of the database is remapped.
            let db = index.db.content.remap_key_type::<FacetKeyCodec<MyByteSlice>>();
            for (facet, docids) in descending_facet_sort(&rtxn, &db, 0, candidates) {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results);
            rtxn.commit().unwrap();
        }
    }
}

View File

@ -1,649 +0,0 @@
// //! This module contains helpers iterators for facet strings.
// //!
// //! The purpose is to help iterate over the quite complex system of facets strings. A simple
// //! description of the system would be that every facet string value is stored into an LMDB database
// //! and that every value is associated with the document ids which are associated with this facet
// //! string value.
// //!
// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet
// //! string values; those aggregations help in choosing the right groups of facets to follow.
// //!
// //! ## A typical algorithm run
// //!
// //! If a group of aggregated facets values contains one of the documents ids, we must continue
// //! iterating over the sub-groups.
// //!
// //! If this group is the lowest level and contain at least one document id we yield the associated
// //! facet documents ids.
// //!
// //! If the group doesn't contain one of our documents ids, we continue to the next group at this
// //! same level.
// //!
// //! ## The complexity comes from the strings
// //!
// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create
// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the
// //! two numbers bounds, the left and the right bound of the group, both inclusive.
// //!
// //! It is easy to make sure that the groups are ordered: LMDB sorts its keys lexicographically and
// //! putting two big-endian encoded numbers one after the other gives us ordered groups. The values
// //! are simple unions of the documents ids coming from the groups below.
// //!
// //! ### Example of what a facet number LMDB database contain
// //!
// //! | level | left-bound | right-bound | documents ids |
// //! |-------|------------|-------------|------------------|
// //! | 0 | 0 | _skipped_ | 1, 2 |
// //! | 0 | 1 | _skipped_ | 6, 7 |
// //! | 0 | 3 | _skipped_ | 4, 7 |
// //! | 0 | 5 | _skipped_ | 2, 3, 4 |
// //! | 1 | 0 | 1 | 1, 2, 6, 7 |
// //! | 1 | 3 | 5 | 2, 3, 4, 7 |
// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 |
// //!
// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second
// //! bound, that's the base level where you can directly fetch the documents ids associated with an
// //! exact number.
// //!
// //! The next levels have two different bounds and the associated documents ids are simply the result
// //! of an union of all the documents ids associated with the aggregated groups above.
// //!
// //! ## The complexity of defining groups for facet strings
// //!
// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw
// //! form and a simple `strcmp` will define the order in which keys will be read from the store.
// //!
// //! That's easy for types with a known size, like floats or integers: they are 64 bits long and
// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the
// //! first number then by the second if the first number is equal on two keys.
// //!
// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet
// //! strings is different for each facet value.
// //!
// //! ### Basic approach: padding the keys
// //!
// //! A first approach would be to simply define the maximum size of a facet string and pad the keys
// //! with zeroes. The big problem of this approach is that it:
// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
// //! other.
// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB
// //! performances.
// //!
// //! ### Better approach: number the facet groups
// //!
// //! A better approach would be to number the groups, this way we don't have the downsides of the
// //! previously described approach but we need to be able to describe the groups by using a number.
// //!
// //! #### Example of facet strings with numbered groups
// //!
// //! | level | left-bound | right-bound | left-string | right-string | documents ids |
// //! |-------|------------|-------------|-------------|--------------|------------------|
// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 |
// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 |
// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 |
// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 |
// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 |
// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 |
// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
// //!
// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
// //! need to store the facet string value two times.
// //!
// //! The numbers in the left-bound and right-bound columns are incremental numbers representing the
// //! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
// //! of the LMDB keys.
// //!
// //! In the value, not in the key, you can see that we added two new values: the left-string and the
// //! right-string, which defines the original facet strings associated with the given group.
// //!
// //! We put those two strings inside of the value, this way we do not limit the maximum size of the
// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big
// //! values on another page, this helps in iterating over keys fast enough and only fetch the page
// //! with the values when required.
// //!
// //! The other little advantage with this solution is that there is not a big overhead, compared with
// //! the facet number levels, we only duplicate the facet strings once for the level 1.
// //!
// //! #### A typical algorithm run
// //!
// //! Note that the algorithm is always moving from the highest level to the lowest one, one level
// //! by one level, this is why it is ok to only store the facets string on the level 1.
// //!
// //! If a group of aggregated facet values (a group with numbers) contains one of the documents ids,
// //! we must continue iterating over the sub-groups. To do so:
// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds
// //! and iterate over the facet groups defined by these numbers over the current level - 1.
// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the
// //! value and just do the same as with the facet numbers but with strings: iterate over the
// //! current level - 1 with both keys.
// //!
// //! If this group is the lowest level (level 0) and contains at least one document id we yield the
// //! associated facet documents ids.
// //!
// //! If the group doesn't contain one of our documents ids, we continue to the next group at this
// //! same level.
// //!
// use std::num::NonZeroU8;
// use std::ops::Bound;
// use std::ops::Bound::{Excluded, Included, Unbounded};
// use either::{Either, Left, Right};
// use heed::types::{ByteSlice, DecodeIgnore};
// use heed::{Database, LazyDecode, RoRange, RoRevRange};
// use roaring::RoaringBitmap;
// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec;
// use crate::heed_codec::CboRoaringBitmapCodec;
// use crate::{FieldId, Index};
// /// An iterator that is used to explore the facets level strings
// /// from the level 1 to infinity.
// ///
// /// It yields the level, group id that an entry covers, the optional group strings
// /// that it covers of the level 0 only if it is an entry from the level 1 and
// /// the roaring bitmap associated.
// pub struct FacetStringGroupRange<'t> {
// iter: RoRange<
// 't,
// FacetLevelValueU32Codec,
// LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
// >,
// end: Bound<u32>,
// }
// impl<'t> FacetStringGroupRange<'t> {
// pub fn new<X, Y>(
// rtxn: &'t heed::RoTxn,
// db: Database<X, Y>,
// field_id: FieldId,
// level: NonZeroU8,
// left: Bound<u32>,
// right: Bound<u32>,
// ) -> heed::Result<FacetStringGroupRange<'t>> {
// let db = db.remap_types::<
// FacetLevelValueU32Codec,
// FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
// >();
// let left_bound = match left {
// Included(left) => Included((field_id, level, left, u32::MIN)),
// Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
// };
// let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
// Ok(FacetStringGroupRange { iter, end: right })
// }
// }
// impl<'t> Iterator for FacetStringGroupRange<'t> {
// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
// fn next(&mut self) -> Option<Self::Item> {
// match self.iter.next() {
// Some(Ok(((_fid, level, left, right), docids))) => {
// let must_be_returned = match self.end {
// Included(end) => right <= end,
// Excluded(end) => right < end,
// Unbounded => true,
// };
// if must_be_returned {
// match docids.decode() {
// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))),
// Err(e) => Some(Err(e)),
// }
// } else {
// None
// }
// }
// Some(Err(e)) => Some(Err(e)),
// None => None,
// }
// }
// }
// pub struct FacetStringGroupRevRange<'t> {
// iter: RoRevRange<
// 't,
// FacetLevelValueU32Codec,
// LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
// >,
// end: Bound<u32>,
// }
// impl<'t> FacetStringGroupRevRange<'t> {
// pub fn new<X, Y>(
// rtxn: &'t heed::RoTxn,
// db: Database<X, Y>,
// field_id: FieldId,
// level: NonZeroU8,
// left: Bound<u32>,
// right: Bound<u32>,
// ) -> heed::Result<FacetStringGroupRevRange<'t>> {
// let db = db.remap_types::<
// FacetLevelValueU32Codec,
// FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
// >();
// let left_bound = match left {
// Included(left) => Included((field_id, level, left, u32::MIN)),
// Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
// };
// let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
// Ok(FacetStringGroupRevRange { iter, end: right })
// }
// }
// impl<'t> Iterator for FacetStringGroupRevRange<'t> {
// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
// fn next(&mut self) -> Option<Self::Item> {
// loop {
// match self.iter.next() {
// Some(Ok(((_fid, level, left, right), docids))) => {
// let must_be_returned = match self.end {
// Included(end) => right <= end,
// Excluded(end) => right < end,
// Unbounded => true,
// };
// if must_be_returned {
// match docids.decode() {
// Ok((bounds, docids)) => {
// return Some(Ok(((level, left, right), (bounds, docids))))
// }
// Err(e) => return Some(Err(e)),
// }
// }
// continue;
// }
// Some(Err(e)) => return Some(Err(e)),
// None => return None,
// }
// }
// }
// }
// /// An iterator that is used to explore the level 0 of the facets string database.
// ///
// /// It yields the facet string and the roaring bitmap associated with it.
// pub struct FacetStringLevelZeroRange<'t> {
// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
// }
// impl<'t> FacetStringLevelZeroRange<'t> {
// pub fn new<X, Y>(
// rtxn: &'t heed::RoTxn,
// db: Database<X, Y>,
// field_id: FieldId,
// left: Bound<&str>,
// right: Bound<&str>,
// ) -> heed::Result<FacetStringLevelZeroRange<'t>> {
// fn encode_value<'a>(buffer: &'a mut Vec<u8>, field_id: FieldId, value: &str) -> &'a [u8] {
// buffer.extend_from_slice(&field_id.to_be_bytes());
// buffer.push(0);
// buffer.extend_from_slice(value.as_bytes());
// &buffer[..]
// }
// let mut left_buffer = Vec::new();
// let left_bound = match left {
// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)),
// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)),
// Unbounded => {
// left_buffer.extend_from_slice(&field_id.to_be_bytes());
// left_buffer.push(0);
// Included(&left_buffer[..])
// }
// };
// let mut right_buffer = Vec::new();
// let right_bound = match right {
// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)),
// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)),
// Unbounded => {
// right_buffer.extend_from_slice(&field_id.to_be_bytes());
// right_buffer.push(1); // we must only get the level 0
// Excluded(&right_buffer[..])
// }
// };
// let iter = db
// .remap_key_type::<ByteSlice>()
// .range(rtxn, &(left_bound, right_bound))?
// .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
// Ok(FacetStringLevelZeroRange { iter })
// }
// }
// impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// match self.iter.next() {
// Some(Ok(((_fid, normalized), (original, docids)))) => {
// Some(Ok((normalized, original, docids)))
// }
// Some(Err(e)) => Some(Err(e)),
// None => None,
// }
// }
// }
// pub struct FacetStringLevelZeroRevRange<'t> {
// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
// }
// impl<'t> FacetStringLevelZeroRevRange<'t> {
// pub fn new<X, Y>(
// rtxn: &'t heed::RoTxn,
// db: Database<X, Y>,
// field_id: FieldId,
// left: Bound<&str>,
// right: Bound<&str>,
// ) -> heed::Result<FacetStringLevelZeroRevRange<'t>> {
// fn encode_value<'a>(buffer: &'a mut Vec<u8>, field_id: FieldId, value: &str) -> &'a [u8] {
// buffer.extend_from_slice(&field_id.to_be_bytes());
// buffer.push(0);
// buffer.extend_from_slice(value.as_bytes());
// &buffer[..]
// }
// let mut left_buffer = Vec::new();
// let left_bound = match left {
// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)),
// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)),
// Unbounded => {
// left_buffer.extend_from_slice(&field_id.to_be_bytes());
// left_buffer.push(0);
// Included(&left_buffer[..])
// }
// };
// let mut right_buffer = Vec::new();
// let right_bound = match right {
// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)),
// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)),
// Unbounded => {
// right_buffer.extend_from_slice(&field_id.to_be_bytes());
// right_buffer.push(1); // we must only get the level 0
// Excluded(&right_buffer[..])
// }
// };
// let iter = db
// .remap_key_type::<ByteSlice>()
// .rev_range(rtxn, &(left_bound, right_bound))?
// .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
// Ok(FacetStringLevelZeroRevRange { iter })
// }
// }
// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> {
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// match self.iter.next() {
// Some(Ok(((_fid, normalized), (original, docids)))) => {
// Some(Ok((normalized, original, docids)))
// }
// Some(Err(e)) => Some(Err(e)),
// None => None,
// }
// }
// }
// type EitherStringRange<'t> = Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>;
// type EitherStringRevRange<'t> =
// Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>;
// /// An iterator that is used to explore the facet strings level by level,
// /// it will only return facets strings that are associated with the
// /// candidates documents ids given.
// pub struct FacetStringIter<'t> {
// rtxn: &'t heed::RoTxn<'t>,
// db: Database<ByteSlice, ByteSlice>,
// field_id: FieldId,
// level_iters: Vec<(RoaringBitmap, Either<EitherStringRange<'t>, EitherStringRevRange<'t>>)>,
// must_reduce: bool,
// }
// impl<'t> FacetStringIter<'t> {
// pub fn new_reducing(
// rtxn: &'t heed::RoTxn,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetStringIter<'t>> {
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
// Ok(FacetStringIter {
// rtxn,
// db,
// field_id,
// level_iters: vec![(documents_ids, Left(highest_iter))],
// must_reduce: true,
// })
// }
// pub fn new_reverse_reducing(
// rtxn: &'t heed::RoTxn,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetStringIter<'t>> {
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?;
// Ok(FacetStringIter {
// rtxn,
// db,
// field_id,
// level_iters: vec![(documents_ids, Right(highest_reverse_iter))],
// must_reduce: true,
// })
// }
// pub fn new_non_reducing(
// rtxn: &'t heed::RoTxn,
// index: &'t Index,
// field_id: FieldId,
// documents_ids: RoaringBitmap,
// ) -> heed::Result<FacetStringIter<'t>> {
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
// Ok(FacetStringIter {
// rtxn,
// db,
// field_id,
// level_iters: vec![(documents_ids, Left(highest_iter))],
// must_reduce: false,
// })
// }
// fn highest_level<X, Y>(
// rtxn: &'t heed::RoTxn,
// db: Database<X, Y>,
// fid: FieldId,
// ) -> heed::Result<Option<u8>> {
// Ok(db
// .remap_types::<ByteSlice, DecodeIgnore>()
// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bytes
// .last()
// .transpose()?
// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third byte
// }
// fn highest_iter<X, Y>(
// rtxn: &'t heed::RoTxn,
// index: &'t Index,
// db: Database<X, Y>,
// field_id: FieldId,
// ) -> heed::Result<Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>> {
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
// match NonZeroU8::new(highest_level) {
// Some(highest_level) => FacetStringGroupRange::new(
// rtxn,
// index.facet_id_string_docids,
// field_id,
// highest_level,
// Unbounded,
// Unbounded,
// )
// .map(Left),
// None => FacetStringLevelZeroRange::new(
// rtxn,
// index.facet_id_string_docids,
// field_id,
// Unbounded,
// Unbounded,
// )
// .map(Right),
// }
// }
// fn highest_reverse_iter<X, Y>(
// rtxn: &'t heed::RoTxn,
// index: &'t Index,
// db: Database<X, Y>,
// field_id: FieldId,
// ) -> heed::Result<Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>> {
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
// match NonZeroU8::new(highest_level) {
// Some(highest_level) => FacetStringGroupRevRange::new(
// rtxn,
// index.facet_id_string_docids,
// field_id,
// highest_level,
// Unbounded,
// Unbounded,
// )
// .map(Left),
// None => FacetStringLevelZeroRevRange::new(
// rtxn,
// index.facet_id_string_docids,
// field_id,
// Unbounded,
// Unbounded,
// )
// .map(Right),
// }
// }
// }
// impl<'t> Iterator for FacetStringIter<'t> {
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
// fn next(&mut self) -> Option<Self::Item> {
// 'outer: loop {
// let (documents_ids, last) = self.level_iters.last_mut()?;
// let is_ascending = last.is_left();
// // We remap the different iterator types to make
// // the algorithm less complex to understand.
// let last = match last {
// Left(ascending) => match ascending {
// Left(group) => Left(Left(group)),
// Right(zero_level) => Right(Left(zero_level)),
// },
// Right(descending) => match descending {
// Left(group) => Left(Right(group)),
// Right(zero_level) => Right(Right(zero_level)),
// },
// };
// match last {
// Left(group) => {
// for result in group {
// match result {
// Ok(((level, left, right), (string_bounds, mut docids))) => {
// docids &= &*documents_ids;
// if !docids.is_empty() {
// if self.must_reduce {
// *documents_ids -= &docids;
// }
// let result = if is_ascending {
// match string_bounds {
// Some((left, right)) => FacetStringLevelZeroRange::new(
// self.rtxn,
// self.db,
// self.field_id,
// Included(left),
// Included(right),
// )
// .map(Right),
// None => FacetStringGroupRange::new(
// self.rtxn,
// self.db,
// self.field_id,
// NonZeroU8::new(level.get() - 1).unwrap(),
// Included(left),
// Included(right),
// )
// .map(Left),
// }
// .map(Left)
// } else {
// match string_bounds {
// Some((left, right)) => {
// FacetStringLevelZeroRevRange::new(
// self.rtxn,
// self.db,
// self.field_id,
// Included(left),
// Included(right),
// )
// .map(Right)
// }
// None => FacetStringGroupRevRange::new(
// self.rtxn,
// self.db,
// self.field_id,
// NonZeroU8::new(level.get() - 1).unwrap(),
// Included(left),
// Included(right),
// )
// .map(Left),
// }
// .map(Right)
// };
// match result {
// Ok(iter) => {
// self.level_iters.push((docids, iter));
// continue 'outer;
// }
// Err(e) => return Some(Err(e)),
// }
// }
// }
// Err(e) => return Some(Err(e)),
// }
// }
// }
// Right(zero_level) => {
// // level zero only
// for result in zero_level {
// match result {
// Ok((normalized, original, mut docids)) => {
// docids &= &*documents_ids;
// if !docids.is_empty() {
// if self.must_reduce {
// *documents_ids -= &docids;
// }
// return Some(Ok((normalized, original, docids)));
// }
// }
// Err(e) => return Some(Err(e)),
// }
// }
// }
// }
// self.level_iters.pop();
// }
// }
// }

View File

@ -1,9 +1,79 @@
use heed::types::ByteSlice;
use heed::{BytesDecode, RoTxn};
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice};
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET}; pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; // pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange};
// pub use self::facet_string::FacetStringIter; // pub use self::facet_string::FacetStringIter;
pub use self::filter::Filter; pub use self::filter::Filter;
mod facet_distribution; mod facet_distribution;
mod facet_number; mod facet_distribution_iter;
mod facet_string; mod facet_sort_ascending;
mod facet_sort_descending;
mod filter; mod filter;
/// Returns the left bound of the first level-0 entry stored for `field_id`,
/// decoded with `BoundCodec`, or `None` when the prefix iterator yields nothing.
///
/// The lookup prefix is the big-endian encoding of `field_id` followed by a
/// single `0` byte (the level).
///
/// # Panics
///
/// Panics if the prefix iterator cannot be created, if reading the first entry
/// fails, or if the key cannot be decoded as a `FacetKeyCodec<BoundCodec>` key.
fn get_first_facet_value<'t, BoundCodec>(
    txn: &'t RoTxn,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
) -> Option<BoundCodec::DItem>
where
    BoundCodec: BytesDecode<'t>,
{
    // Key prefix: two big-endian bytes of the field id, then the level byte (0).
    let [fid_hi, fid_lo] = field_id.to_be_bytes();
    let prefix = [fid_hi, fid_lo, 0u8];

    let mut level0_entries = db
        .as_polymorph()
        .prefix_iter::<_, ByteSlice, ByteSlice>(txn, &prefix[..])
        .unwrap();

    // No entry under this prefix: there is no first value to report.
    let first_entry = level0_entries.next()?;
    let (raw_key, _) = first_entry.unwrap();
    let decoded_key = FacetKeyCodec::<BoundCodec>::bytes_decode(raw_key).unwrap();
    Some(decoded_key.left_bound)
}
/// Returns the left bound of the last level-0 entry stored for `field_id`,
/// decoded with `BoundCodec`, or `None` when the reverse prefix iterator yields
/// nothing.
///
/// The lookup prefix is the big-endian encoding of `field_id` followed by a
/// single `0` byte (the level); the entry returned is the first one produced
/// by iterating that prefix in reverse.
///
/// # Panics
///
/// Panics if the reverse prefix iterator cannot be created, if reading the
/// entry fails, or if the key cannot be decoded as a
/// `FacetKeyCodec<BoundCodec>` key.
fn get_last_facet_value<'t, BoundCodec>(
    txn: &'t RoTxn,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
) -> Option<BoundCodec::DItem>
where
    BoundCodec: BytesDecode<'t>,
{
    // Key prefix: two big-endian bytes of the field id, then the level byte (0).
    let [fid_hi, fid_lo] = field_id.to_be_bytes();
    let prefix = [fid_hi, fid_lo, 0u8];

    let mut level0_entries_rev = db
        .as_polymorph()
        .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, &prefix[..])
        .unwrap();

    level0_entries_rev.next().map(|entry| {
        let (raw_key, _) = entry.unwrap();
        FacetKeyCodec::<BoundCodec>::bytes_decode(raw_key).unwrap().left_bound
    })
}
/// Returns the `level` of the first key produced when iterating, in reverse,
/// over every entry whose key starts with the big-endian bytes of `field_id`.
/// Returns `0` when no such entry exists.
///
/// # Panics
///
/// Panics if the reverse prefix iterator cannot be created, if reading the
/// entry fails, or if the key cannot be decoded as a
/// `FacetKeyCodec<MyByteSlice>` key.
fn get_highest_level<'t>(
    txn: &'t RoTxn<'t>,
    db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
    field_id: u16,
) -> u8 {
    let prefix = field_id.to_be_bytes();
    let mut entries_rev = db
        .as_polymorph()
        .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, &prefix[..])
        .unwrap();

    let last_entry = entries_rev.next();
    match last_entry {
        Some(entry) => {
            let (raw_key, _) = entry.unwrap();
            let decoded_key = FacetKeyCodec::<MyByteSlice>::bytes_decode(raw_key).unwrap();
            decoded_key.level
        }
        // No entry for this field id: fall back to level 0.
        None => 0,
    }
}

View File

@ -64,7 +64,7 @@ impl<'i> Facets<'i> {
} }
#[logging_timer::time("Facets::{}")] #[logging_timer::time("Facets::{}")]
pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> { pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?; self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
// We get the faceted fields to be able to create the facet levels. // We get the faceted fields to be able to create the facet levels.
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone(); let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
@ -172,14 +172,14 @@ impl<'t> CreateFacetsAlgo<'t> {
bitmaps.push(docids); bitmaps.push(docids);
if bitmaps.len() == self.level_group_size { if bitmaps.len() == self.level_group_size {
handle_group(&bitmaps, left_bound); handle_group(&bitmaps, left_bound)?;
first_iteration_for_new_group = true; first_iteration_for_new_group = true;
bitmaps.clear(); bitmaps.clear();
} }
} }
// don't forget to give the leftover bitmaps as well // don't forget to give the leftover bitmaps as well
if !bitmaps.is_empty() { if !bitmaps.is_empty() {
handle_group(&bitmaps, left_bound); handle_group(&bitmaps, left_bound)?;
bitmaps.clear(); bitmaps.clear();
} }
Ok(()) Ok(())
@ -197,7 +197,7 @@ impl<'t> CreateFacetsAlgo<'t> {
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>, handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
) -> Result<Vec<grenad::Reader<File>>> { ) -> Result<Vec<grenad::Reader<File>>> {
if level == 0 { if level == 0 {
self.read_level_0(handle_group); self.read_level_0(handle_group)?;
// Level 0 is already in the database // Level 0 is already in the database
return Ok(vec![]); return Ok(vec![]);
} }