mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
Start porting facet distribution and sort to new database structure
This commit is contained in:
parent
7913d6365c
commit
63ef0aba18
199
milli/src/search/facet/facet_distribution_iter.rs
Normal file
199
milli/src/search/facet/facet_distribution_iter.rs
Normal file
@ -0,0 +1,199 @@
|
|||||||
|
use roaring::RoaringBitmap;
|
||||||
|
use std::ops::ControlFlow;
|
||||||
|
|
||||||
|
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
|
||||||
|
|
||||||
|
use super::{get_first_facet_value, get_highest_level};
|
||||||
|
|
||||||
|
pub fn iterate_over_facet_distribution<'t, CB>(
|
||||||
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
candidates: &RoaringBitmap,
|
||||||
|
callback: CB,
|
||||||
|
) where
|
||||||
|
CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
|
||||||
|
{
|
||||||
|
let mut fd = FacetDistribution { rtxn, db, field_id, callback };
|
||||||
|
let highest_level =
|
||||||
|
get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id);
|
||||||
|
|
||||||
|
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) {
|
||||||
|
fd.iterate(candidates, highest_level, first_bound, usize::MAX);
|
||||||
|
return;
|
||||||
|
} else {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct FacetDistribution<'t, CB>
|
||||||
|
where
|
||||||
|
CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
|
||||||
|
{
|
||||||
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
callback: CB,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, CB> FacetDistribution<'t, CB>
|
||||||
|
where
|
||||||
|
CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
|
||||||
|
{
|
||||||
|
fn iterate_level_0(
|
||||||
|
&mut self,
|
||||||
|
candidates: &RoaringBitmap,
|
||||||
|
starting_bound: &'t [u8],
|
||||||
|
group_size: usize,
|
||||||
|
) -> ControlFlow<()> {
|
||||||
|
let starting_key =
|
||||||
|
FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
|
||||||
|
let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size);
|
||||||
|
for el in iter {
|
||||||
|
let (key, value) = el.unwrap();
|
||||||
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||||
|
// so we need to check that we are not iterating over the next field id
|
||||||
|
if key.field_id != self.field_id {
|
||||||
|
return ControlFlow::Break(());
|
||||||
|
}
|
||||||
|
let docids_in_common = value.bitmap.intersection_len(candidates);
|
||||||
|
if docids_in_common > 0 {
|
||||||
|
match (self.callback)(key.left_bound, docids_in_common) {
|
||||||
|
ControlFlow::Continue(_) => {}
|
||||||
|
ControlFlow::Break(_) => return ControlFlow::Break(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return ControlFlow::Continue(());
|
||||||
|
}
|
||||||
|
fn iterate(
|
||||||
|
&mut self,
|
||||||
|
candidates: &RoaringBitmap,
|
||||||
|
level: u8,
|
||||||
|
starting_bound: &'t [u8],
|
||||||
|
group_size: usize,
|
||||||
|
) -> ControlFlow<()> {
|
||||||
|
if level == 0 {
|
||||||
|
return self.iterate_level_0(candidates, starting_bound, group_size);
|
||||||
|
}
|
||||||
|
let starting_key = FacetKey { field_id: self.field_id, level, left_bound: starting_bound };
|
||||||
|
let iter = self.db.range(&self.rtxn, &(&starting_key..)).unwrap().take(group_size);
|
||||||
|
|
||||||
|
for el in iter {
|
||||||
|
let (key, value) = el.unwrap();
|
||||||
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||||
|
// so we need to check that we are not iterating over the next field id
|
||||||
|
if key.field_id != self.field_id {
|
||||||
|
return ControlFlow::Break(());
|
||||||
|
}
|
||||||
|
let docids_in_common = value.bitmap & candidates;
|
||||||
|
if docids_in_common.len() > 0 {
|
||||||
|
let cf =
|
||||||
|
self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize);
|
||||||
|
match cf {
|
||||||
|
ControlFlow::Continue(_) => {}
|
||||||
|
ControlFlow::Break(_) => return ControlFlow::Break(()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return ControlFlow::Continue(());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use crate::{codec::U16Codec, Index};
    use heed::BytesDecode;
    use roaring::RoaringBitmap;
    use std::ops::ControlFlow;

    use super::iterate_over_facet_distribution;

    /// Index where facet value `i` (for `i` in `0..256`) is held by document `i` only.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();
        for i in 0..256u16 {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            index.insert(&mut txn, 0, &i, &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    /// Index built from 128 pseudo-random keys below 256 (fixed seed), where
    /// each key is inserted with documents `key` and `key + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();

        let rng = fastrand::Rng::with_seed(0);
        let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::<Vec<u32>>();

        for key in keys {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(key);
            bitmap.insert(key + 100);
            index.insert(&mut txn, 0, &(key as u16), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            iterate_over_facet_distribution(
                &txn,
                &index.db.content,
                0,
                &candidates,
                |facet, count| {
                    let facet = U16Codec::bytes_decode(facet).unwrap();
                    results.push_str(&format!("{facet}: {count}\n"));
                    ControlFlow::Continue(())
                },
            );
            insta::assert_snapshot!(format!("filter_distribution_{i}_all"), results);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (0..=255).collect::<RoaringBitmap>();
            let mut results = String::new();
            let mut nbr_facets = 0;
            iterate_over_facet_distribution(
                &txn,
                &index.db.content,
                0,
                &candidates,
                |facet, count| {
                    let facet = U16Codec::bytes_decode(facet).unwrap();
                    // Stop once 100 facet values have been recorded.
                    if nbr_facets == 100 {
                        ControlFlow::Break(())
                    } else {
                        nbr_facets += 1;
                        results.push_str(&format!("{facet}: {count}\n"));
                        ControlFlow::Continue(())
                    }
                },
            );
            insta::assert_snapshot!(format!("filter_distribution_{i}_all_stop_early"), results);

            txn.commit().unwrap();
        }
    }
}
|
@ -1,335 +0,0 @@
|
|||||||
// use std::ops::Bound::{self, Excluded, Included, Unbounded};
|
|
||||||
|
|
||||||
// use either::Either::{self, Left, Right};
|
|
||||||
// use heed::types::{ByteSlice, DecodeIgnore};
|
|
||||||
// use heed::{BytesDecode, BytesEncode, Database, Lazy, LazyDecode, RoRange, RoRevRange};
|
|
||||||
// use obkv::Key;
|
|
||||||
// use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
// use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
|
|
||||||
// use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec};
|
|
||||||
// use crate::heed_codec::CboRoaringBitmapCodec;
|
|
||||||
// use crate::{FieldId, Index};
|
|
||||||
|
|
||||||
// pub struct FacetNumberRange<'t, 'e> {
|
|
||||||
// rtxn: &'t heed::RoTxn<'e>,
|
|
||||||
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
|
||||||
// iter: RoRange<'t, FacetKeyCodec<OrderedF64Codec>, LazyDecode<FacetGroupValueCodec>>,
|
|
||||||
// max_bound: f64,
|
|
||||||
// previous: Option<(FacetKey<f64>, Lazy<'t, FacetGroupValueCodec>)>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// end: Bound<f64>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t, 'e> FacetNumberRange<'t, 'e> {
|
|
||||||
// pub fn new(
|
|
||||||
// rtxn: &'t heed::RoTxn<'e>,
|
|
||||||
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level: u8,
|
|
||||||
// left: Bound<f64>,
|
|
||||||
// right: Bound<f64>,
|
|
||||||
// ) -> heed::Result<FacetNumberRange<'t, 'e>> {
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(left_bound) => Included(FacetKey { field_id, level, left_bound }),
|
|
||||||
// Excluded(left_bound) => Excluded(FacetKey { field_id, level, left_bound }),
|
|
||||||
// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }),
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let mut iter = db.lazily_decode_data().range(rtxn, &(left_bound, Unbounded))?;
|
|
||||||
// let mut previous = iter.next().transpose()?;
|
|
||||||
|
|
||||||
// // Compute the maximum end bound by looking at the key of the last element in level 0
|
|
||||||
// let mut prefix_level_0 = vec![];
|
|
||||||
// prefix_level_0.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// prefix_level_0.push(level);
|
|
||||||
|
|
||||||
// let mut rev_iter =
|
|
||||||
// db.as_polymorph().rev_prefix_iter::<_, ByteSlice, ByteSlice>(rtxn, &prefix_level_0)?;
|
|
||||||
|
|
||||||
// let rev_iter_first = rev_iter.next().transpose()?;
|
|
||||||
// let max_bound = if let Some((max_bound_key, _)) = rev_iter_first {
|
|
||||||
// let max_bound_key =
|
|
||||||
// FacetKeyCodec::<OrderedF64Codec>::bytes_decode(max_bound_key).unwrap();
|
|
||||||
// max_bound_key.left_bound
|
|
||||||
// } else {
|
|
||||||
// // I can't imagine when that would happen, but let's handle it correctly anyway
|
|
||||||
// // by making the iterator empty
|
|
||||||
// previous = None;
|
|
||||||
// 0.0 // doesn't matter since previous = None so the iterator will always early exit
|
|
||||||
// // and return None itself
|
|
||||||
// };
|
|
||||||
|
|
||||||
// Ok(FacetNumberRange { rtxn, db, iter, field_id, previous, max_bound, end: right })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t, 'e> Iterator for FacetNumberRange<'t, 'e> {
|
|
||||||
// type Item = heed::Result<(FacetKey<f64>, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// // The idea here is to return the **previous** element only if the left
|
|
||||||
// // bound of the current key fits within the range given to the iter
|
|
||||||
// // if it doesn't, then there is still a chance that it must be returned,
|
|
||||||
// // but we need to check the actual right bound of the group by looking for
|
|
||||||
// // the key preceding the first key of the next group in level 0
|
|
||||||
|
|
||||||
// let (prev_key, prev_value) = self.previous?;
|
|
||||||
|
|
||||||
// let (next_left_bound, next_previous) = if let Some(next) = self.iter.next() {
|
|
||||||
// let (key, group_value) = match next {
|
|
||||||
// Ok(n) => n,
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// };
|
|
||||||
// (key.left_bound, Some((key, group_value)))
|
|
||||||
// } else {
|
|
||||||
// // we're at the end of the level iter, so we need to fetch the max bound instead
|
|
||||||
// (self.max_bound, None)
|
|
||||||
// };
|
|
||||||
// let must_be_returned = match self.end {
|
|
||||||
// Included(end) => next_left_bound <= end,
|
|
||||||
// Excluded(end) => next_left_bound < end,
|
|
||||||
// Unbounded => true,
|
|
||||||
// };
|
|
||||||
// if must_be_returned {
|
|
||||||
// match prev_value.decode() {
|
|
||||||
// Ok(group_value) => {
|
|
||||||
// self.previous = next_previous;
|
|
||||||
// Some(Ok((prev_key, group_value.bitmap)))
|
|
||||||
// }
|
|
||||||
// Err(e) => Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// } else {
|
|
||||||
// it is still possible that we want to return the value (one last time)
|
|
||||||
// // but to do so, we need to fetch the right bound of the current group
|
|
||||||
// // this is done by getting the first element at level 0 of the next group
|
|
||||||
// // then iterating in reverse from it
|
|
||||||
// // once we have the right bound, we can compare it, and then return or not
|
|
||||||
// // then we still set self.previous to None so that no other element can return
|
|
||||||
// // from it?
|
|
||||||
// let mut level_0_key_prefix = vec![];
|
|
||||||
// level_0_key_prefix.extend_from_slice(&self.field_id.to_be_bytes());
|
|
||||||
// level_0_key_prefix.push(0);
|
|
||||||
// let key =
|
|
||||||
// FacetKey::<f64> { field_id: self.field_id, level: 0, left_bound: next_left_bound };
|
|
||||||
// let key_bytes = FacetKeyCodec::<OrderedF64Codec>::bytes_encode(&key).unwrap();
|
|
||||||
// level_0_key_prefix.extend_from_slice(&key_bytes);
|
|
||||||
|
|
||||||
// let mut rev_iter_next_group_level_0 = self
|
|
||||||
// .db
|
|
||||||
// .as_polymorph()
|
|
||||||
// .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&self.rtxn, &level_0_key_prefix)
|
|
||||||
// .unwrap();
|
|
||||||
// let (key_for_right_bound, _) = rev_iter_next_group_level_0.next().unwrap().unwrap();
|
|
||||||
// let key_for_right_bound =
|
|
||||||
// FacetKeyCodec::<OrderedF64Codec>::bytes_decode(key_for_right_bound).unwrap();
|
|
||||||
// let right_bound = key_for_right_bound.left_bound;
|
|
||||||
// let must_be_returned = match self.end {
|
|
||||||
// Included(end) => right_bound <= end,
|
|
||||||
// Excluded(end) => right_bound < end,
|
|
||||||
// Unbounded => unreachable!(),
|
|
||||||
// };
|
|
||||||
// self.previous = None;
|
|
||||||
// if must_be_returned {
|
|
||||||
// match prev_value.decode() {
|
|
||||||
// Ok(group_value) => Some(Ok((prev_key, group_value.bitmap))),
|
|
||||||
// Err(e) => Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// } else {
|
|
||||||
// None
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub struct FacetNumberRevRange<'t> {
|
|
||||||
// iter: RoRevRange<'t, FacetKeyCodec<OrderedF64Codec>, LazyDecode<FacetGroupValueCodec>>,
|
|
||||||
// end: Bound<f64>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetNumberRevRange<'t> {
|
|
||||||
// pub fn new(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level: u8,
|
|
||||||
// left: Bound<f64>,
|
|
||||||
// right: Bound<f64>,
|
|
||||||
// ) -> heed::Result<FacetNumberRevRange<'t>> {
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(left) => Included(FacetKey { field_id, level, left_bound: left }),
|
|
||||||
// Excluded(left) => Excluded(FacetKey { field_id, level, left_bound: left }),
|
|
||||||
// Unbounded => Included(FacetKey { field_id, level, left_bound: f64::MIN }),
|
|
||||||
// };
|
|
||||||
// let right_bound = Included(FacetKey { field_id, level, left_bound: f64::MAX });
|
|
||||||
// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
|
|
||||||
// Ok(FacetNumberRevRange { iter, end: right })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetNumberRevRange<'t> {
|
|
||||||
// type Item = heed::Result<(FacetKey<f64>, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// loop {
|
|
||||||
// match self.iter.next() {
|
|
||||||
// Some(Ok((FacetKey { field_id, level, left_bound }, docids))) => {
|
|
||||||
// let must_be_returned = match self.end {
|
|
||||||
// Included(end) => todo!(), //right <= end,
|
|
||||||
// Excluded(end) => todo!(), //right < end,
|
|
||||||
// Unbounded => true,
|
|
||||||
// };
|
|
||||||
// if must_be_returned {
|
|
||||||
// match docids.decode() {
|
|
||||||
// Ok(docids) => {
|
|
||||||
// return Some(Ok((
|
|
||||||
// FacetKey { field_id, level, left_bound },
|
|
||||||
// docids.bitmap,
|
|
||||||
// )))
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// continue;
|
|
||||||
// }
|
|
||||||
// Some(Err(e)) => return Some(Err(e)),
|
|
||||||
// None => return None,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub struct FacetNumberIter<'t, 'e> {
|
|
||||||
// rtxn: &'t heed::RoTxn<'t>,
|
|
||||||
// db: Database<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level_iters: Vec<(RoaringBitmap, Either<FacetNumberRange<'t, 'e>, FacetNumberRevRange<'t>>)>,
|
|
||||||
// must_reduce: bool,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t, 'e> FacetNumberIter<'t, 'e> {
|
|
||||||
// /// Create a `FacetNumberIter` that will iterate on the different facet entries
|
|
||||||
// /// (facet value + documents ids) and that will reduce the given documents ids
|
|
||||||
// /// while iterating on the different facet levels.
|
|
||||||
// pub fn new_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn<'e>,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
|
|
||||||
// let db = index.facet_id_f64_docids;
|
|
||||||
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
|
||||||
// let highest_iter =
|
|
||||||
// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
|
||||||
// let level_iters = vec![(documents_ids, Left(highest_iter))];
|
|
||||||
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse
|
|
||||||
// /// (facet value + documents ids) and that will reduce the given documents ids
|
|
||||||
// /// while iterating on the different facet levels.
|
|
||||||
// pub fn new_reverse_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn<'e>,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
|
|
||||||
// let db = index.facet_id_f64_docids;
|
|
||||||
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
|
||||||
// let highest_iter =
|
|
||||||
// FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
|
||||||
// let level_iters = vec![(documents_ids, Right(highest_iter))];
|
|
||||||
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /// Create a `FacetNumberIter` that will iterate on the different facet entries
|
|
||||||
// /// (facet value + documents ids) and that will not reduce the given documents ids
|
|
||||||
// /// while iterating on the different facet levels, possibly returning multiple times
|
|
||||||
// /// a document id associated with multiple facet values.
|
|
||||||
// pub fn new_non_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn<'e>,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetNumberIter<'t, 'e>> {
|
|
||||||
// let db = index.facet_id_f64_docids;
|
|
||||||
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
|
||||||
// let highest_iter =
|
|
||||||
// FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?;
|
|
||||||
// let level_iters = vec![(documents_ids, Left(highest_iter))];
|
|
||||||
// Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn highest_level<X>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<FacetKeyCodec<OrderedF64Codec>, X>,
|
|
||||||
// fid: FieldId,
|
|
||||||
// ) -> heed::Result<Option<u8>> {
|
|
||||||
// let level = db
|
|
||||||
// .remap_types::<ByteSlice, DecodeIgnore>()
|
|
||||||
// .prefix_iter(rtxn, &fid.to_be_bytes())?
|
|
||||||
// .remap_key_type::<FacetKeyCodec<OrderedF64Codec>>()
|
|
||||||
// .last()
|
|
||||||
// .transpose()?
|
|
||||||
// .map(|(key, _)| key.level);
|
|
||||||
// Ok(level)
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t, 'e> Iterator for FacetNumberIter<'t, 'e> {
|
|
||||||
// type Item = heed::Result<(f64, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// 'outer: loop {
|
|
||||||
// let (documents_ids, last) = self.level_iters.last_mut()?;
|
|
||||||
// let is_ascending = last.is_left();
|
|
||||||
// for result in last {
|
|
||||||
// // If the last iterator must find an empty set of documents it means
|
|
||||||
// // that we found all the documents in the sub level iterations already,
|
|
||||||
// // we can pop this level iterator.
|
|
||||||
// if documents_ids.is_empty() {
|
|
||||||
// break;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// match result {
|
|
||||||
// Ok((key, mut docids)) => {
|
|
||||||
// docids &= &*documents_ids;
|
|
||||||
// if !docids.is_empty() {
|
|
||||||
// if self.must_reduce {
|
|
||||||
// *documents_ids -= &docids;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// if level == 0 {
|
|
||||||
// return Some(Ok((left, docids)));
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let rtxn = self.rtxn;
|
|
||||||
// let db = self.db;
|
|
||||||
// let fid = self.field_id;
|
|
||||||
// let left = Included(left);
|
|
||||||
// let right = Included(right);
|
|
||||||
|
|
||||||
// let result = if is_ascending {
|
|
||||||
// FacetNumberRange::new(rtxn, db, fid, level - 1, left, right)
|
|
||||||
// .map(Left)
|
|
||||||
// } else {
|
|
||||||
// FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right)
|
|
||||||
// .map(Right)
|
|
||||||
// };
|
|
||||||
|
|
||||||
// match result {
|
|
||||||
// Ok(iter) => {
|
|
||||||
// self.level_iters.push((docids, iter));
|
|
||||||
// continue 'outer;
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// self.level_iters.pop();
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
147
milli/src/search/facet/facet_sort_ascending.rs
Normal file
147
milli/src/search/facet/facet_sort_ascending.rs
Normal file
@ -0,0 +1,147 @@
|
|||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::heed_codec::facet::new::{
|
||||||
|
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{get_first_facet_value, get_highest_level};
|
||||||
|
|
||||||
|
pub fn ascending_facet_sort<'t>(
|
||||||
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
candidates: RoaringBitmap,
|
||||||
|
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> {
|
||||||
|
let highest_level =
|
||||||
|
get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id);
|
||||||
|
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(
|
||||||
|
rtxn,
|
||||||
|
&db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
|
||||||
|
field_id,
|
||||||
|
) {
|
||||||
|
let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
|
||||||
|
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
|
||||||
|
|
||||||
|
Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] })
|
||||||
|
} else {
|
||||||
|
return Box::new(std::iter::empty());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct AscendingFacetSort<'t, 'e> {
|
||||||
|
rtxn: &'t heed::RoTxn<'e>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
stack: Vec<(
|
||||||
|
RoaringBitmap,
|
||||||
|
std::iter::Take<heed::RoRange<'t, FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>>,
|
||||||
|
)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
|
||||||
|
type Item = (&'t [u8], RoaringBitmap);
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
'outer: loop {
|
||||||
|
let (documents_ids, deepest_iter) = self.stack.last_mut()?;
|
||||||
|
for result in deepest_iter {
|
||||||
|
let (
|
||||||
|
FacetKey { level, left_bound, field_id },
|
||||||
|
FacetGroupValue { size: group_size, mut bitmap },
|
||||||
|
) = result.unwrap();
|
||||||
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||||
|
// so we need to check that we are not iterating over the next field id
|
||||||
|
if field_id != self.field_id {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// If the last iterator found an empty set of documents it means
|
||||||
|
// that we found all the documents in the sub level iterations already,
|
||||||
|
// we can pop this level iterator.
|
||||||
|
if documents_ids.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
bitmap &= &*documents_ids;
|
||||||
|
if !bitmap.is_empty() {
|
||||||
|
*documents_ids -= &bitmap;
|
||||||
|
|
||||||
|
if level == 0 {
|
||||||
|
return Some((left_bound, bitmap));
|
||||||
|
}
|
||||||
|
let starting_key_below =
|
||||||
|
FacetKey { field_id: self.field_id, level: level - 1, left_bound };
|
||||||
|
let iter = self
|
||||||
|
.db
|
||||||
|
.range(&self.rtxn, &(starting_key_below..))
|
||||||
|
.unwrap()
|
||||||
|
.take(group_size as usize);
|
||||||
|
|
||||||
|
self.stack.push((bitmap, iter));
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
self.stack.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use crate::{
        ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index,
    };
    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    /// Index where facet value `i` (for `i` in `0..256`) is held by document `i` only.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();
        for i in 0..256u16 {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            index.insert(&mut txn, 0, &i, &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    /// Index built from 128 pseudo-random keys below 256 (fixed seed), where
    /// each key is inserted with documents `key` and `key + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();

        let rng = fastrand::Rng::with_seed(0);
        let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::<Vec<u32>>();

        for key in keys {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(key);
            bitmap.insert(key + 100);
            index.insert(&mut txn, 0, &(key as u16), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    #[test]
    fn filter_sort() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let mut results = String::new();
            let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates);
            for (facet, docids) in iter {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results);

            txn.commit().unwrap();
        }
    }
}
|
172
milli/src/search/facet/facet_sort_descending.rs
Normal file
172
milli/src/search/facet/facet_sort_descending.rs
Normal file
@ -0,0 +1,172 @@
|
|||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
|
use crate::heed_codec::facet::new::{
|
||||||
|
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
|
||||||
|
};
|
||||||
|
|
||||||
|
use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
|
||||||
|
|
||||||
|
fn descending_facet_sort<'t>(
|
||||||
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
candidates: RoaringBitmap,
|
||||||
|
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> {
|
||||||
|
let highest_level = get_highest_level(rtxn, db, field_id);
|
||||||
|
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) {
|
||||||
|
let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
|
||||||
|
let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id).unwrap();
|
||||||
|
let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound };
|
||||||
|
let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX);
|
||||||
|
Box::new(DescendingFacetSort {
|
||||||
|
rtxn,
|
||||||
|
db,
|
||||||
|
field_id,
|
||||||
|
stack: vec![(candidates, iter, Bound::Included(last_bound))],
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
return Box::new(std::iter::empty());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct DescendingFacetSort<'t> {
|
||||||
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
stack: Vec<(
|
||||||
|
RoaringBitmap,
|
||||||
|
std::iter::Take<heed::RoRevRange<'t, FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>>,
|
||||||
|
Bound<&'t [u8]>,
|
||||||
|
)>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t> Iterator for DescendingFacetSort<'t> {
|
||||||
|
type Item = (&'t [u8], RoaringBitmap);
|
||||||
|
|
||||||
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
|
'outer: loop {
|
||||||
|
let (documents_ids, deepest_iter, right_bound) = self.stack.last_mut()?;
|
||||||
|
while let Some(result) = deepest_iter.next() {
|
||||||
|
let (
|
||||||
|
FacetKey { level, left_bound, field_id },
|
||||||
|
FacetGroupValue { size: group_size, mut bitmap },
|
||||||
|
) = result.unwrap();
|
||||||
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
||||||
|
// so we need to check that we are not iterating over the next field id
|
||||||
|
if field_id != self.field_id {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
// If the last iterator found an empty set of documents it means
|
||||||
|
// that we found all the documents in the sub level iterations already,
|
||||||
|
// we can pop this level iterator.
|
||||||
|
if documents_ids.is_empty() {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
bitmap &= &*documents_ids;
|
||||||
|
if !bitmap.is_empty() {
|
||||||
|
*documents_ids -= &bitmap;
|
||||||
|
|
||||||
|
if level == 0 {
|
||||||
|
return Some((left_bound, bitmap));
|
||||||
|
}
|
||||||
|
let starting_key_below = FacetKey { field_id, level: level - 1, left_bound };
|
||||||
|
|
||||||
|
let end_key_kelow = match *right_bound {
|
||||||
|
Bound::Included(right) => Bound::Included(FacetKey {
|
||||||
|
field_id,
|
||||||
|
level: level - 1,
|
||||||
|
left_bound: right,
|
||||||
|
}),
|
||||||
|
Bound::Excluded(right) => Bound::Excluded(FacetKey {
|
||||||
|
field_id,
|
||||||
|
level: level - 1,
|
||||||
|
left_bound: right,
|
||||||
|
}),
|
||||||
|
Bound::Unbounded => Bound::Unbounded,
|
||||||
|
};
|
||||||
|
let prev_right_bound = *right_bound;
|
||||||
|
*right_bound = Bound::Excluded(left_bound);
|
||||||
|
let iter = self
|
||||||
|
.db
|
||||||
|
.rev_range(
|
||||||
|
&self.rtxn,
|
||||||
|
&(Bound::Included(starting_key_below), end_key_kelow),
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
.take(group_size as usize);
|
||||||
|
|
||||||
|
self.stack.push((bitmap, iter, prev_right_bound));
|
||||||
|
continue 'outer;
|
||||||
|
}
|
||||||
|
*right_bound = Bound::Excluded(left_bound);
|
||||||
|
}
|
||||||
|
self.stack.pop();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use crate::codec::{MyByteSlice, U16Codec};
    use crate::descending_facet_sort::descending_facet_sort;
    use crate::{display_bitmap, FacetKeyCodec, Index};

    /// Builds an index where facet value `i` (for `i` in `0..256`) is held by document `i` only.
    fn get_simple_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();
        for value in 0..256u16 {
            let docids = std::iter::once(u32::from(value)).collect::<RoaringBitmap>();
            index.insert(&mut wtxn, 0, &value, &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    /// Builds an index from 128 pseudo-random facet values (fixed seed); each value `k`
    /// is inserted with documents `k` and `k + 100`.
    fn get_random_looking_index() -> Index<U16Codec> {
        let index = Index::<U16Codec>::new(4, 8);
        let mut wtxn = index.env.write_txn().unwrap();

        let rng = fastrand::Rng::with_seed(0);
        let keys: Vec<u32> = std::iter::repeat_with(|| rng.u32(..256)).take(128).collect();

        for key in keys {
            let mut docids = RoaringBitmap::new();
            docids.insert(key);
            docids.insert(key + 100);
            index.insert(&mut wtxn, 0, &(key as u16), &docids);
        }
        wtxn.commit().unwrap();
        index
    }

    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    #[test]
    fn filter_sort_descending() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let db = index.db.content.remap_key_type::<FacetKeyCodec<MyByteSlice>>();

            // One line per yielded facet value, in the order produced by the iterator.
            let mut results = String::new();
            for (facet, docids) in descending_facet_sort(&txn, &db, 0, candidates) {
                let facet = U16Codec::bytes_decode(facet).unwrap();
                results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results);

            txn.commit().unwrap();
        }
    }
}
|
@ -1,649 +0,0 @@
|
|||||||
// //! This module contains helpers iterators for facet strings.
|
|
||||||
// //!
|
|
||||||
// //! The purpose is to help iterate over the quite complex system of facets strings. A simple
|
|
||||||
// //! description of the system would be that every facet string value is stored into an LMDB database
|
|
||||||
// //! and that every value is associated with the document ids which are associated with this facet
|
|
||||||
// //! string value.
|
|
||||||
// //!
|
|
||||||
// //! In reality it is a little bit more complex as we have to create aggregations of runs of facet
|
|
||||||
// //! string values, those aggregations helps in choosing the right groups of facets to follow.
|
|
||||||
// //!
|
|
||||||
// //! ## A typical algorithm run
|
|
||||||
// //!
|
|
||||||
// //! If a group of aggregated facets values contains one of the documents ids, we must continue
|
|
||||||
// //! iterating over the sub-groups.
|
|
||||||
// //!
|
|
||||||
// //! If this group is the lowest level and contain at least one document id we yield the associated
|
|
||||||
// //! facet documents ids.
|
|
||||||
// //!
|
|
||||||
// //! If the group doesn't contain one of our documents ids, we continue to the next group at this
|
|
||||||
// //! same level.
|
|
||||||
// //!
|
|
||||||
// //! ## The complexity comes from the strings
|
|
||||||
// //!
|
|
||||||
// //! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create
|
|
||||||
// //! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the
|
|
||||||
// //! two numbers bounds, the left and the right bound of the group, both inclusive.
|
|
||||||
// //!
|
|
||||||
// //! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and
|
|
||||||
// //! putting two numbers big-endian encoded one after the other gives us ordered groups. The values
|
|
||||||
// //! are simple unions of the documents ids coming from the groups below.
|
|
||||||
// //!
|
|
||||||
// //! ### Example of what a facet number LMDB database contain
|
|
||||||
// //!
|
|
||||||
// //! | level | left-bound | right-bound | documents ids |
|
|
||||||
// //! |-------|------------|-------------|------------------|
|
|
||||||
// //! | 0 | 0 | _skipped_ | 1, 2 |
|
|
||||||
// //! | 0 | 1 | _skipped_ | 6, 7 |
|
|
||||||
// //! | 0 | 3 | _skipped_ | 4, 7 |
|
|
||||||
// //! | 0 | 5 | _skipped_ | 2, 3, 4 |
|
|
||||||
// //! | 1 | 0 | 1 | 1, 2, 6, 7 |
|
|
||||||
// //! | 1 | 3 | 5 | 2, 3, 4, 7 |
|
|
||||||
// //! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 |
|
|
||||||
// //!
|
|
||||||
// //! As you can see the level 0 have two equal bounds, therefore we skip serializing the second
|
|
||||||
// //! bound, that's the base level where you can directly fetch the documents ids associated with an
|
|
||||||
// //! exact number.
|
|
||||||
// //!
|
|
||||||
// //! The next levels have two different bounds and the associated documents ids are simply the result
|
|
||||||
// //! of an union of all the documents ids associated with the aggregated groups above.
|
|
||||||
// //!
|
|
||||||
// //! ## The complexity of defining groups for facet strings
|
|
||||||
// //!
|
|
||||||
// //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
|
|
||||||
// //! lexicographical order, it means that whatever the key represent the bytes are read in their raw
|
|
||||||
// //! form and a simple `strcmp` will define the order in which keys will be read from the store.
|
|
||||||
// //!
|
|
||||||
// //! That's easy for types with a known size, like floats or integers, they are 64 bits long and
|
|
||||||
// //! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the
|
|
||||||
// //! first number then by the second if the first number is equal on two keys.
|
|
||||||
// //!
|
|
||||||
// //! For strings it is a lot more complex as those types are unsized, it means that the size of facet
|
|
||||||
// //! strings is different for each facet value.
|
|
||||||
// //!
|
|
||||||
// //! ### Basic approach: padding the keys
|
|
||||||
// //!
|
|
||||||
// //! A first approach would be to simply define the maximum size of a facet string and pad the keys
|
|
||||||
// //! with zeroes. The big problem of this approach is that it:
|
|
||||||
// //! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the
|
|
||||||
// //! other.
|
|
||||||
// //! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB
|
|
||||||
// //! performances.
|
|
||||||
// //!
|
|
||||||
// //! ### Better approach: number the facet groups
|
|
||||||
// //!
|
|
||||||
// //! A better approach would be to number the groups, this way we don't have the downsides of the
|
|
||||||
// //! previously described approach but we need to be able to describe the groups by using a number.
|
|
||||||
// //!
|
|
||||||
// //! #### Example of facet strings with numbered groups
|
|
||||||
// //!
|
|
||||||
// //! | level | left-bound | right-bound | left-string | right-string | documents ids |
|
|
||||||
// //! |-------|------------|-------------|-------------|--------------|------------------|
|
|
||||||
// //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 |
|
|
||||||
// //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 |
|
|
||||||
// //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 |
|
|
||||||
// //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 |
|
|
||||||
// //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 |
|
|
||||||
// //! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 |
|
|
||||||
// //! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
|
|
||||||
// //!
|
|
||||||
// //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
|
|
||||||
// //! need to store the facet string value two times.
|
|
||||||
// //!
|
|
||||||
// //! The number in the left-bound and right-bound columns are incremental numbers representing the
|
|
||||||
// //! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
|
|
||||||
// //! of the LMDB keys.
|
|
||||||
// //!
|
|
||||||
// //! In the value, not in the key, you can see that we added two new values: the left-string and the
|
|
||||||
// //! right-string, which defines the original facet strings associated with the given group.
|
|
||||||
// //!
|
|
||||||
// //! We put those two strings inside of the value, this way we do not limit the maximum size of the
|
|
||||||
// //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big
|
|
||||||
// //! values on another page, this helps in iterating over keys fast enough and only fetch the page
|
|
||||||
// //! with the values when required.
|
|
||||||
// //!
|
|
||||||
// //! The other little advantage with this solution is that there is no big overhead, compared with
|
|
||||||
// //! the facet number levels, we only duplicate the facet strings once for the level 1.
|
|
||||||
// //!
|
|
||||||
// //! #### A typical algorithm run
|
|
||||||
// //!
|
|
||||||
// //! Note that the algorithm is always moving from the highest level to the lowest one, one level
|
|
||||||
// //! by one level, this is why it is ok to only store the facets string on the level 1.
|
|
||||||
// //!
|
|
||||||
// //! If a group of aggregated facets values, a group with numbers contains one of the documents ids,
|
|
||||||
// //! we must continue iterating over the sub-groups. To do so:
|
|
||||||
// //! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds
|
|
||||||
// //! and iterate over the facet groups defined by these numbers over the current level - 1.
|
|
||||||
// //! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the
|
|
||||||
// //! value and just do the same as with the facet numbers but with strings: iterate over the
|
|
||||||
// //! current level - 1 with both keys.
|
|
||||||
// //!
|
|
||||||
// //! If this group is the lowest level (level 0) and contain at least one document id we yield the
|
|
||||||
// //! associated facet documents ids.
|
|
||||||
// //!
|
|
||||||
// //! If the group doesn't contain one of our documents ids, we continue to the next group at this
|
|
||||||
// //! same level.
|
|
||||||
// //!
|
|
||||||
|
|
||||||
// use std::num::NonZeroU8;
|
|
||||||
// use std::ops::Bound;
|
|
||||||
// use std::ops::Bound::{Excluded, Included, Unbounded};
|
|
||||||
|
|
||||||
// use either::{Either, Left, Right};
|
|
||||||
// use heed::types::{ByteSlice, DecodeIgnore};
|
|
||||||
// use heed::{Database, LazyDecode, RoRange, RoRevRange};
|
|
||||||
// use roaring::RoaringBitmap;
|
|
||||||
|
|
||||||
// use crate::heed_codec::facet::FacetStringZeroBoundsValueCodec;
|
|
||||||
// use crate::heed_codec::CboRoaringBitmapCodec;
|
|
||||||
// use crate::{FieldId, Index};
|
|
||||||
|
|
||||||
// /// An iterator that is used to explore the facets level strings
|
|
||||||
// /// from the level 1 to infinity.
|
|
||||||
// ///
|
|
||||||
// /// It yields the level, group id that an entry covers, the optional group strings
|
|
||||||
// /// that it covers of the level 0 only if it is an entry from the level 1 and
|
|
||||||
// /// the roaring bitmap associated.
|
|
||||||
// pub struct FacetStringGroupRange<'t> {
|
|
||||||
// iter: RoRange<
|
|
||||||
// 't,
|
|
||||||
// FacetLevelValueU32Codec,
|
|
||||||
// LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
|
|
||||||
// >,
|
|
||||||
// end: Bound<u32>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetStringGroupRange<'t> {
|
|
||||||
// pub fn new<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level: NonZeroU8,
|
|
||||||
// left: Bound<u32>,
|
|
||||||
// right: Bound<u32>,
|
|
||||||
// ) -> heed::Result<FacetStringGroupRange<'t>> {
|
|
||||||
// let db = db.remap_types::<
|
|
||||||
// FacetLevelValueU32Codec,
|
|
||||||
// FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
|
|
||||||
// >();
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(left) => Included((field_id, level, left, u32::MIN)),
|
|
||||||
// Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
|
|
||||||
// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
|
|
||||||
// };
|
|
||||||
// let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
|
|
||||||
// let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
|
|
||||||
// Ok(FacetStringGroupRange { iter, end: right })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetStringGroupRange<'t> {
|
|
||||||
// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// match self.iter.next() {
|
|
||||||
// Some(Ok(((_fid, level, left, right), docids))) => {
|
|
||||||
// let must_be_returned = match self.end {
|
|
||||||
// Included(end) => right <= end,
|
|
||||||
// Excluded(end) => right < end,
|
|
||||||
// Unbounded => true,
|
|
||||||
// };
|
|
||||||
// if must_be_returned {
|
|
||||||
// match docids.decode() {
|
|
||||||
// Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))),
|
|
||||||
// Err(e) => Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// } else {
|
|
||||||
// None
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// Some(Err(e)) => Some(Err(e)),
|
|
||||||
// None => None,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub struct FacetStringGroupRevRange<'t> {
|
|
||||||
// iter: RoRevRange<
|
|
||||||
// 't,
|
|
||||||
// FacetLevelValueU32Codec,
|
|
||||||
// LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
|
|
||||||
// >,
|
|
||||||
// end: Bound<u32>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetStringGroupRevRange<'t> {
|
|
||||||
// pub fn new<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level: NonZeroU8,
|
|
||||||
// left: Bound<u32>,
|
|
||||||
// right: Bound<u32>,
|
|
||||||
// ) -> heed::Result<FacetStringGroupRevRange<'t>> {
|
|
||||||
// let db = db.remap_types::<
|
|
||||||
// FacetLevelValueU32Codec,
|
|
||||||
// FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
|
|
||||||
// >();
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(left) => Included((field_id, level, left, u32::MIN)),
|
|
||||||
// Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
|
|
||||||
// Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
|
|
||||||
// };
|
|
||||||
// let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
|
|
||||||
// let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?;
|
|
||||||
// Ok(FacetStringGroupRevRange { iter, end: right })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetStringGroupRevRange<'t> {
|
|
||||||
// type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// loop {
|
|
||||||
// match self.iter.next() {
|
|
||||||
// Some(Ok(((_fid, level, left, right), docids))) => {
|
|
||||||
// let must_be_returned = match self.end {
|
|
||||||
// Included(end) => right <= end,
|
|
||||||
// Excluded(end) => right < end,
|
|
||||||
// Unbounded => true,
|
|
||||||
// };
|
|
||||||
// if must_be_returned {
|
|
||||||
// match docids.decode() {
|
|
||||||
// Ok((bounds, docids)) => {
|
|
||||||
// return Some(Ok(((level, left, right), (bounds, docids))))
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// continue;
|
|
||||||
// }
|
|
||||||
// Some(Err(e)) => return Some(Err(e)),
|
|
||||||
// None => return None,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// /// An iterator that is used to explore the level 0 of the facets string database.
|
|
||||||
// ///
|
|
||||||
// /// It yields the facet string and the roaring bitmap associated with it.
|
|
||||||
// pub struct FacetStringLevelZeroRange<'t> {
|
|
||||||
// iter: RoRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetStringLevelZeroRange<'t> {
|
|
||||||
// pub fn new<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// left: Bound<&str>,
|
|
||||||
// right: Bound<&str>,
|
|
||||||
// ) -> heed::Result<FacetStringLevelZeroRange<'t>> {
|
|
||||||
// fn encode_value<'a>(buffer: &'a mut Vec<u8>, field_id: FieldId, value: &str) -> &'a [u8] {
|
|
||||||
// buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// buffer.push(0);
|
|
||||||
// buffer.extend_from_slice(value.as_bytes());
|
|
||||||
// &buffer[..]
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let mut left_buffer = Vec::new();
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)),
|
|
||||||
// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)),
|
|
||||||
// Unbounded => {
|
|
||||||
// left_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// left_buffer.push(0);
|
|
||||||
// Included(&left_buffer[..])
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let mut right_buffer = Vec::new();
|
|
||||||
// let right_bound = match right {
|
|
||||||
// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)),
|
|
||||||
// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)),
|
|
||||||
// Unbounded => {
|
|
||||||
// right_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// right_buffer.push(1); // we must only get the level 0
|
|
||||||
// Excluded(&right_buffer[..])
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let iter = db
|
|
||||||
// .remap_key_type::<ByteSlice>()
|
|
||||||
// .range(rtxn, &(left_bound, right_bound))?
|
|
||||||
// .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
|
|
||||||
|
|
||||||
// Ok(FacetStringLevelZeroRange { iter })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
|
|
||||||
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// match self.iter.next() {
|
|
||||||
// Some(Ok(((_fid, normalized), (original, docids)))) => {
|
|
||||||
// Some(Ok((normalized, original, docids)))
|
|
||||||
// }
|
|
||||||
// Some(Err(e)) => Some(Err(e)),
|
|
||||||
// None => None,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub struct FacetStringLevelZeroRevRange<'t> {
|
|
||||||
// iter: RoRevRange<'t, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetStringLevelZeroRevRange<'t> {
|
|
||||||
// pub fn new<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// left: Bound<&str>,
|
|
||||||
// right: Bound<&str>,
|
|
||||||
// ) -> heed::Result<FacetStringLevelZeroRevRange<'t>> {
|
|
||||||
// fn encode_value<'a>(buffer: &'a mut Vec<u8>, field_id: FieldId, value: &str) -> &'a [u8] {
|
|
||||||
// buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// buffer.push(0);
|
|
||||||
// buffer.extend_from_slice(value.as_bytes());
|
|
||||||
// &buffer[..]
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let mut left_buffer = Vec::new();
|
|
||||||
// let left_bound = match left {
|
|
||||||
// Included(value) => Included(encode_value(&mut left_buffer, field_id, value)),
|
|
||||||
// Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)),
|
|
||||||
// Unbounded => {
|
|
||||||
// left_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// left_buffer.push(0);
|
|
||||||
// Included(&left_buffer[..])
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let mut right_buffer = Vec::new();
|
|
||||||
// let right_bound = match right {
|
|
||||||
// Included(value) => Included(encode_value(&mut right_buffer, field_id, value)),
|
|
||||||
// Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)),
|
|
||||||
// Unbounded => {
|
|
||||||
// right_buffer.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
// right_buffer.push(1); // we must only get the level 0
|
|
||||||
// Excluded(&right_buffer[..])
|
|
||||||
// }
|
|
||||||
// };
|
|
||||||
|
|
||||||
// let iter = db
|
|
||||||
// .remap_key_type::<ByteSlice>()
|
|
||||||
// .rev_range(rtxn, &(left_bound, right_bound))?
|
|
||||||
// .remap_types::<FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec>();
|
|
||||||
|
|
||||||
// Ok(FacetStringLevelZeroRevRange { iter })
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetStringLevelZeroRevRange<'t> {
|
|
||||||
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// match self.iter.next() {
|
|
||||||
// Some(Ok(((_fid, normalized), (original, docids)))) => {
|
|
||||||
// Some(Ok((normalized, original, docids)))
|
|
||||||
// }
|
|
||||||
// Some(Err(e)) => Some(Err(e)),
|
|
||||||
// None => None,
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// type EitherStringRange<'t> = Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>;
|
|
||||||
// type EitherStringRevRange<'t> =
|
|
||||||
// Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>;
|
|
||||||
|
|
||||||
// /// An iterator that is used to explore the facet strings level by level,
|
|
||||||
// /// it will only return facets strings that are associated with the
|
|
||||||
// /// candidates documents ids given.
|
|
||||||
// pub struct FacetStringIter<'t> {
|
|
||||||
// rtxn: &'t heed::RoTxn<'t>,
|
|
||||||
// db: Database<ByteSlice, ByteSlice>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// level_iters: Vec<(RoaringBitmap, Either<EitherStringRange<'t>, EitherStringRevRange<'t>>)>,
|
|
||||||
// must_reduce: bool,
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> FacetStringIter<'t> {
|
|
||||||
// pub fn new_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetStringIter<'t>> {
|
|
||||||
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
|
|
||||||
// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
|
|
||||||
// Ok(FacetStringIter {
|
|
||||||
// rtxn,
|
|
||||||
// db,
|
|
||||||
// field_id,
|
|
||||||
// level_iters: vec![(documents_ids, Left(highest_iter))],
|
|
||||||
// must_reduce: true,
|
|
||||||
// })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub fn new_reverse_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetStringIter<'t>> {
|
|
||||||
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
|
|
||||||
// let highest_reverse_iter = Self::highest_reverse_iter(rtxn, index, db, field_id)?;
|
|
||||||
// Ok(FacetStringIter {
|
|
||||||
// rtxn,
|
|
||||||
// db,
|
|
||||||
// field_id,
|
|
||||||
// level_iters: vec![(documents_ids, Right(highest_reverse_iter))],
|
|
||||||
// must_reduce: true,
|
|
||||||
// })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// pub fn new_non_reducing(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// index: &'t Index,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// documents_ids: RoaringBitmap,
|
|
||||||
// ) -> heed::Result<FacetStringIter<'t>> {
|
|
||||||
// let db = index.facet_id_string_docids.remap_types::<ByteSlice, ByteSlice>();
|
|
||||||
// let highest_iter = Self::highest_iter(rtxn, index, db, field_id)?;
|
|
||||||
// Ok(FacetStringIter {
|
|
||||||
// rtxn,
|
|
||||||
// db,
|
|
||||||
// field_id,
|
|
||||||
// level_iters: vec![(documents_ids, Left(highest_iter))],
|
|
||||||
// must_reduce: false,
|
|
||||||
// })
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn highest_level<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// fid: FieldId,
|
|
||||||
// ) -> heed::Result<Option<u8>> {
|
|
||||||
// Ok(db
|
|
||||||
// .remap_types::<ByteSlice, DecodeIgnore>()
|
|
||||||
// .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bytes
|
|
||||||
// .last()
|
|
||||||
// .transpose()?
|
|
||||||
// .map(|(key_bytes, _)| key_bytes[2])) // the level is the third byte
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn highest_iter<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// index: &'t Index,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// ) -> heed::Result<Either<FacetStringGroupRange<'t>, FacetStringLevelZeroRange<'t>>> {
|
|
||||||
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
|
||||||
// match NonZeroU8::new(highest_level) {
|
|
||||||
// Some(highest_level) => FacetStringGroupRange::new(
|
|
||||||
// rtxn,
|
|
||||||
// index.facet_id_string_docids,
|
|
||||||
// field_id,
|
|
||||||
// highest_level,
|
|
||||||
// Unbounded,
|
|
||||||
// Unbounded,
|
|
||||||
// )
|
|
||||||
// .map(Left),
|
|
||||||
// None => FacetStringLevelZeroRange::new(
|
|
||||||
// rtxn,
|
|
||||||
// index.facet_id_string_docids,
|
|
||||||
// field_id,
|
|
||||||
// Unbounded,
|
|
||||||
// Unbounded,
|
|
||||||
// )
|
|
||||||
// .map(Right),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// fn highest_reverse_iter<X, Y>(
|
|
||||||
// rtxn: &'t heed::RoTxn,
|
|
||||||
// index: &'t Index,
|
|
||||||
// db: Database<X, Y>,
|
|
||||||
// field_id: FieldId,
|
|
||||||
// ) -> heed::Result<Either<FacetStringGroupRevRange<'t>, FacetStringLevelZeroRevRange<'t>>> {
|
|
||||||
// let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0);
|
|
||||||
// match NonZeroU8::new(highest_level) {
|
|
||||||
// Some(highest_level) => FacetStringGroupRevRange::new(
|
|
||||||
// rtxn,
|
|
||||||
// index.facet_id_string_docids,
|
|
||||||
// field_id,
|
|
||||||
// highest_level,
|
|
||||||
// Unbounded,
|
|
||||||
// Unbounded,
|
|
||||||
// )
|
|
||||||
// .map(Left),
|
|
||||||
// None => FacetStringLevelZeroRevRange::new(
|
|
||||||
// rtxn,
|
|
||||||
// index.facet_id_string_docids,
|
|
||||||
// field_id,
|
|
||||||
// Unbounded,
|
|
||||||
// Unbounded,
|
|
||||||
// )
|
|
||||||
// .map(Right),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// impl<'t> Iterator for FacetStringIter<'t> {
|
|
||||||
// type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>;
|
|
||||||
|
|
||||||
// fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
// 'outer: loop {
|
|
||||||
// let (documents_ids, last) = self.level_iters.last_mut()?;
|
|
||||||
// let is_ascending = last.is_left();
|
|
||||||
|
|
||||||
// // We remap the different iterator types to make
|
|
||||||
// // the algorithm less complex to understand.
|
|
||||||
// let last = match last {
|
|
||||||
// Left(ascending) => match ascending {
|
|
||||||
// Left(group) => Left(Left(group)),
|
|
||||||
// Right(zero_level) => Right(Left(zero_level)),
|
|
||||||
// },
|
|
||||||
// Right(descending) => match descending {
|
|
||||||
// Left(group) => Left(Right(group)),
|
|
||||||
// Right(zero_level) => Right(Right(zero_level)),
|
|
||||||
// },
|
|
||||||
// };
|
|
||||||
|
|
||||||
// match last {
|
|
||||||
// Left(group) => {
|
|
||||||
// for result in group {
|
|
||||||
// match result {
|
|
||||||
// Ok(((level, left, right), (string_bounds, mut docids))) => {
|
|
||||||
// docids &= &*documents_ids;
|
|
||||||
// if !docids.is_empty() {
|
|
||||||
// if self.must_reduce {
|
|
||||||
// *documents_ids -= &docids;
|
|
||||||
// }
|
|
||||||
|
|
||||||
// let result = if is_ascending {
|
|
||||||
// match string_bounds {
|
|
||||||
// Some((left, right)) => FacetStringLevelZeroRange::new(
|
|
||||||
// self.rtxn,
|
|
||||||
// self.db,
|
|
||||||
// self.field_id,
|
|
||||||
// Included(left),
|
|
||||||
// Included(right),
|
|
||||||
// )
|
|
||||||
// .map(Right),
|
|
||||||
// None => FacetStringGroupRange::new(
|
|
||||||
// self.rtxn,
|
|
||||||
// self.db,
|
|
||||||
// self.field_id,
|
|
||||||
// NonZeroU8::new(level.get() - 1).unwrap(),
|
|
||||||
// Included(left),
|
|
||||||
// Included(right),
|
|
||||||
// )
|
|
||||||
// .map(Left),
|
|
||||||
// }
|
|
||||||
// .map(Left)
|
|
||||||
// } else {
|
|
||||||
// match string_bounds {
|
|
||||||
// Some((left, right)) => {
|
|
||||||
// FacetStringLevelZeroRevRange::new(
|
|
||||||
// self.rtxn,
|
|
||||||
// self.db,
|
|
||||||
// self.field_id,
|
|
||||||
// Included(left),
|
|
||||||
// Included(right),
|
|
||||||
// )
|
|
||||||
// .map(Right)
|
|
||||||
// }
|
|
||||||
// None => FacetStringGroupRevRange::new(
|
|
||||||
// self.rtxn,
|
|
||||||
// self.db,
|
|
||||||
// self.field_id,
|
|
||||||
// NonZeroU8::new(level.get() - 1).unwrap(),
|
|
||||||
// Included(left),
|
|
||||||
// Included(right),
|
|
||||||
// )
|
|
||||||
// .map(Left),
|
|
||||||
// }
|
|
||||||
// .map(Right)
|
|
||||||
// };
|
|
||||||
|
|
||||||
// match result {
|
|
||||||
// Ok(iter) => {
|
|
||||||
// self.level_iters.push((docids, iter));
|
|
||||||
// continue 'outer;
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// Right(zero_level) => {
|
|
||||||
// // level zero only
|
|
||||||
// for result in zero_level {
|
|
||||||
// match result {
|
|
||||||
// Ok((normalized, original, mut docids)) => {
|
|
||||||
// docids &= &*documents_ids;
|
|
||||||
// if !docids.is_empty() {
|
|
||||||
// if self.must_reduce {
|
|
||||||
// *documents_ids -= &docids;
|
|
||||||
// }
|
|
||||||
// return Some(Ok((normalized, original, docids)));
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// Err(e) => return Some(Err(e)),
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
|
|
||||||
// self.level_iters.pop();
|
|
||||||
// }
|
|
||||||
// }
|
|
||||||
// }
|
|
@ -1,9 +1,79 @@
|
|||||||
|
use heed::types::ByteSlice;
|
||||||
|
use heed::{BytesDecode, RoTxn};
|
||||||
|
|
||||||
|
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice};
|
||||||
|
|
||||||
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
|
pub use self::facet_distribution::{FacetDistribution, DEFAULT_VALUES_PER_FACET};
|
||||||
// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange};
|
// pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange};
|
||||||
// pub use self::facet_string::FacetStringIter;
|
// pub use self::facet_string::FacetStringIter;
|
||||||
pub use self::filter::Filter;
|
pub use self::filter::Filter;
|
||||||
|
|
||||||
mod facet_distribution;
|
mod facet_distribution;
|
||||||
mod facet_number;
|
mod facet_distribution_iter;
|
||||||
mod facet_string;
|
mod facet_sort_ascending;
|
||||||
|
mod facet_sort_descending;
|
||||||
mod filter;
|
mod filter;
|
||||||
|
|
||||||
|
fn get_first_facet_value<'t, BoundCodec>(
|
||||||
|
txn: &'t RoTxn,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
) -> Option<BoundCodec::DItem>
|
||||||
|
where
|
||||||
|
BoundCodec: BytesDecode<'t>,
|
||||||
|
{
|
||||||
|
let mut level0prefix = vec![];
|
||||||
|
level0prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
level0prefix.push(0);
|
||||||
|
let mut level0_iter_forward = db
|
||||||
|
.as_polymorph()
|
||||||
|
.prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())
|
||||||
|
.unwrap();
|
||||||
|
if let Some(first) = level0_iter_forward.next() {
|
||||||
|
let (first_key, _) = first.unwrap();
|
||||||
|
let first_key = FacetKeyCodec::<BoundCodec>::bytes_decode(first_key).unwrap();
|
||||||
|
Some(first_key.left_bound)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn get_last_facet_value<'t, BoundCodec>(
|
||||||
|
txn: &'t RoTxn,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
) -> Option<BoundCodec::DItem>
|
||||||
|
where
|
||||||
|
BoundCodec: BytesDecode<'t>,
|
||||||
|
{
|
||||||
|
let mut level0prefix = vec![];
|
||||||
|
level0prefix.extend_from_slice(&field_id.to_be_bytes());
|
||||||
|
level0prefix.push(0);
|
||||||
|
let mut level0_iter_backward = db
|
||||||
|
.as_polymorph()
|
||||||
|
.rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())
|
||||||
|
.unwrap();
|
||||||
|
if let Some(last) = level0_iter_backward.next() {
|
||||||
|
let (last_key, _) = last.unwrap();
|
||||||
|
let last_key = FacetKeyCodec::<BoundCodec>::bytes_decode(last_key).unwrap();
|
||||||
|
Some(last_key.left_bound)
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
fn get_highest_level<'t>(
|
||||||
|
txn: &'t RoTxn<'t>,
|
||||||
|
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
|
||||||
|
field_id: u16,
|
||||||
|
) -> u8 {
|
||||||
|
let field_id_prefix = &field_id.to_be_bytes();
|
||||||
|
db.as_polymorph()
|
||||||
|
.rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)
|
||||||
|
.unwrap()
|
||||||
|
.next()
|
||||||
|
.map(|el| {
|
||||||
|
let (key, _) = el.unwrap();
|
||||||
|
let key = FacetKeyCodec::<MyByteSlice>::bytes_decode(key).unwrap();
|
||||||
|
key.level
|
||||||
|
})
|
||||||
|
.unwrap_or(0)
|
||||||
|
}
|
||||||
|
@ -64,7 +64,7 @@ impl<'i> Facets<'i> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[logging_timer::time("Facets::{}")]
|
#[logging_timer::time("Facets::{}")]
|
||||||
pub fn execute(mut self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
pub fn execute(self, wtxn: &mut heed::RwTxn) -> Result<()> {
|
||||||
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
self.index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
|
||||||
// We get the faceted fields to be able to create the facet levels.
|
// We get the faceted fields to be able to create the facet levels.
|
||||||
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
|
let faceted_fields = self.index.faceted_fields_ids(wtxn)?.clone();
|
||||||
@ -172,14 +172,14 @@ impl<'t> CreateFacetsAlgo<'t> {
|
|||||||
bitmaps.push(docids);
|
bitmaps.push(docids);
|
||||||
|
|
||||||
if bitmaps.len() == self.level_group_size {
|
if bitmaps.len() == self.level_group_size {
|
||||||
handle_group(&bitmaps, left_bound);
|
handle_group(&bitmaps, left_bound)?;
|
||||||
first_iteration_for_new_group = true;
|
first_iteration_for_new_group = true;
|
||||||
bitmaps.clear();
|
bitmaps.clear();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// don't forget to give the leftover bitmaps as well
|
// don't forget to give the leftover bitmaps as well
|
||||||
if !bitmaps.is_empty() {
|
if !bitmaps.is_empty() {
|
||||||
handle_group(&bitmaps, left_bound);
|
handle_group(&bitmaps, left_bound)?;
|
||||||
bitmaps.clear();
|
bitmaps.clear();
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -197,7 +197,7 @@ impl<'t> CreateFacetsAlgo<'t> {
|
|||||||
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
handle_group: &mut dyn FnMut(&[RoaringBitmap], &'t [u8]) -> Result<()>,
|
||||||
) -> Result<Vec<grenad::Reader<File>>> {
|
) -> Result<Vec<grenad::Reader<File>>> {
|
||||||
if level == 0 {
|
if level == 0 {
|
||||||
self.read_level_0(handle_group);
|
self.read_level_0(handle_group)?;
|
||||||
// Level 0 is already in the database
|
// Level 0 is already in the database
|
||||||
return Ok(vec![]);
|
return Ok(vec![]);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user