Add range search and incremental indexing algorithm

This commit is contained in:
Loïc Lecrenier 2022-08-30 15:22:39 +02:00 committed by Loïc Lecrenier
parent 63ef0aba18
commit b8a1caad5e
8 changed files with 1145 additions and 115 deletions

View File

@ -54,7 +54,7 @@ big_s = "1.0.2"
insta = "1.21.0" insta = "1.21.0"
maplit = "1.0.2" maplit = "1.0.2"
md5 = "0.7.0" md5 = "0.7.0"
rand = "0.8.5" rand = {version = "0.8.5", features = ["small_rng"] }
[features] [features]
default = [ "charabia/default" ] default = [ "charabia/default" ]

View File

@ -1,8 +1,8 @@
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
use crate::Result;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
use super::{get_first_facet_value, get_highest_level}; use super::{get_first_facet_value, get_highest_level};
pub fn iterate_over_facet_distribution<'t, CB>( pub fn iterate_over_facet_distribution<'t, CB>(
@ -11,18 +11,19 @@ pub fn iterate_over_facet_distribution<'t, CB>(
field_id: u16, field_id: u16,
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
callback: CB, callback: CB,
) where ) -> Result<()>
where
CB: FnMut(&'t [u8], u64) -> ControlFlow<()>, CB: FnMut(&'t [u8], u64) -> ControlFlow<()>,
{ {
let mut fd = FacetDistribution { rtxn, db, field_id, callback }; let mut fd = FacetDistribution { rtxn, db, field_id, callback };
let highest_level = let highest_level =
get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id); get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) { if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX); fd.iterate(candidates, highest_level, first_bound, usize::MAX);
return; return Ok(());
} else { } else {
return; return Ok(());
} }
} }
@ -45,26 +46,26 @@ where
candidates: &RoaringBitmap, candidates: &RoaringBitmap,
starting_bound: &'t [u8], starting_bound: &'t [u8],
group_size: usize, group_size: usize,
) -> ControlFlow<()> { ) -> Result<ControlFlow<()>> {
let starting_key = let starting_key =
FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound }; FacetKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
let iter = self.db.range(self.rtxn, &(starting_key..)).unwrap().take(group_size); let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
for el in iter { for el in iter {
let (key, value) = el.unwrap(); let (key, value) = el?;
// The range is unbounded on the right and the group size for the highest level is MAX, // The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id // so we need to check that we are not iterating over the next field id
if key.field_id != self.field_id { if key.field_id != self.field_id {
return ControlFlow::Break(()); return Ok(ControlFlow::Break(()));
} }
let docids_in_common = value.bitmap.intersection_len(candidates); let docids_in_common = value.bitmap.intersection_len(candidates);
if docids_in_common > 0 { if docids_in_common > 0 {
match (self.callback)(key.left_bound, docids_in_common) { match (self.callback)(key.left_bound, docids_in_common) {
ControlFlow::Continue(_) => {} ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return ControlFlow::Break(()), ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
} }
} }
} }
return ControlFlow::Continue(()); return Ok(ControlFlow::Continue(()));
} }
fn iterate( fn iterate(
&mut self, &mut self,
@ -72,7 +73,7 @@ where
level: u8, level: u8,
starting_bound: &'t [u8], starting_bound: &'t [u8],
group_size: usize, group_size: usize,
) -> ControlFlow<()> { ) -> Result<ControlFlow<()>> {
if level == 0 { if level == 0 {
return self.iterate_level_0(candidates, starting_bound, group_size); return self.iterate_level_0(candidates, starting_bound, group_size);
} }
@ -84,34 +85,42 @@ where
// The range is unbounded on the right and the group size for the highest level is MAX, // The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id // so we need to check that we are not iterating over the next field id
if key.field_id != self.field_id { if key.field_id != self.field_id {
return ControlFlow::Break(()); return Ok(ControlFlow::Break(()));
} }
let docids_in_common = value.bitmap & candidates; let docids_in_common = value.bitmap & candidates;
if docids_in_common.len() > 0 { if docids_in_common.len() > 0 {
let cf = let cf = self.iterate(
self.iterate(&docids_in_common, level - 1, key.left_bound, value.size as usize); &docids_in_common,
level - 1,
key.left_bound,
value.size as usize,
)?;
match cf { match cf {
ControlFlow::Continue(_) => {} ControlFlow::Continue(_) => {}
ControlFlow::Break(_) => return ControlFlow::Break(()), ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
} }
} }
} }
return ControlFlow::Continue(()); return Ok(ControlFlow::Continue(()));
} }
} }
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::{codec::U16Codec, Index};
use heed::BytesDecode; use heed::BytesDecode;
use rand::{rngs::SmallRng, Rng, SeedableRng};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use std::ops::ControlFlow; use std::ops::ControlFlow;
use crate::{
heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec, search::facet::test::FacetIndex,
};
use super::iterate_over_facet_distribution; use super::iterate_over_facet_distribution;
fn get_simple_index() -> Index<U16Codec> { fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
let index = Index::<U16Codec>::new(4, 8); let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
for i in 0..256u16 { for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
@ -121,18 +130,19 @@ mod tests {
txn.commit().unwrap(); txn.commit().unwrap();
index index
} }
fn get_random_looking_index() -> Index<U16Codec> { fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
let index = Index::<U16Codec>::new(4, 8); let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
let rng = fastrand::Rng::with_seed(0); let rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::<Vec<u32>>(); let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for (_i, key) in keys.into_iter().enumerate() { for (_i, key) in keys.into_iter().enumerate() {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
bitmap.insert(key); bitmap.insert(key);
bitmap.insert(key + 100); bitmap.insert(key + 100.);
index.insert(&mut txn, 0, &(key as u16), &bitmap); index.insert(&mut txn, 0, &(key as f64), &bitmap);
} }
txn.commit().unwrap(); txn.commit().unwrap();
index index
@ -156,7 +166,7 @@ mod tests {
0, 0,
&candidates, &candidates,
|facet, count| { |facet, count| {
let facet = U16Codec::bytes_decode(facet).unwrap(); let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
results.push_str(&format!("{facet}: {count}\n")); results.push_str(&format!("{facet}: {count}\n"));
ControlFlow::Continue(()) ControlFlow::Continue(())
}, },
@ -180,7 +190,7 @@ mod tests {
0, 0,
&candidates, &candidates,
|facet, count| { |facet, count| {
let facet = U16Codec::bytes_decode(facet).unwrap(); let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
if nbr_facets == 100 { if nbr_facets == 100 {
return ControlFlow::Break(()); return ControlFlow::Break(());
} else { } else {

View File

@ -0,0 +1,451 @@
use heed::BytesEncode;
use roaring::RoaringBitmap;
use std::ops::Bound;
use std::ops::RangeBounds;
use crate::heed_codec::facet::new::FacetGroupValueCodec;
use crate::heed_codec::facet::new::FacetKey;
use crate::heed_codec::facet::new::FacetKeyCodec;
use crate::heed_codec::facet::new::MyByteSlice;
use crate::Result;
use super::get_first_facet_value;
use super::get_highest_level;
use super::get_last_facet_value;
pub fn find_docids_of_facet_within_bounds<'t, BoundCodec>(
rtxn: &'t heed::RoTxn<'t>,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16,
left: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
right: &'t Bound<<BoundCodec as BytesEncode<'t>>::EItem>,
) -> Result<RoaringBitmap>
where
BoundCodec: for<'a> BytesEncode<'a>,
for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized,
{
let inner;
let left = match left {
Bound::Included(left) => {
inner = BoundCodec::bytes_encode(left).unwrap();
Bound::Included(inner.as_ref())
}
Bound::Excluded(left) => {
inner = BoundCodec::bytes_encode(left).unwrap();
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
};
let inner;
let right = match right {
Bound::Included(right) => {
inner = BoundCodec::bytes_encode(right).unwrap();
Bound::Included(inner.as_ref())
}
Bound::Excluded(right) => {
inner = BoundCodec::bytes_encode(right).unwrap();
Bound::Excluded(inner.as_ref())
}
Bound::Unbounded => Bound::Unbounded,
};
let mut docids = RoaringBitmap::new();
let mut f = FacetRangeSearch { rtxn, db, field_id, left, right, docids: &mut docids };
let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id)?.unwrap();
f.run(highest_level, first_bound, Bound::Included(last_bound), usize::MAX)?;
Ok(docids)
} else {
return Ok(RoaringBitmap::new());
}
}
/// Fetch the document ids that have a facet with a value between the two given bounds
///
/// The search state is shared by the recursive `run` / `run_level_0` methods, which
/// accumulate their result into `docids`.
struct FacetRangeSearch<'t, 'b, 'bitmap> {
// Read transaction used for every database access.
rtxn: &'t heed::RoTxn<'t>,
// Database holding the facet keys and their group values.
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
// Identifier of the field whose facet values are searched.
field_id: u16,
// Left bound of the searched range, already encoded as bytes.
left: Bound<&'b [u8]>,
// Right bound of the searched range, already encoded as bytes.
right: Bound<&'b [u8]>,
// Output bitmap: the bitmaps of all matching entries are OR-ed into it.
docids: &'bitmap mut RoaringBitmap,
}
impl<'t, 'b, 'bitmap> FacetRangeSearch<'t, 'b, 'bitmap> {
    /// Visit at most `group_size` level-0 entries starting at `starting_left_bound`
    /// and add to `self.docids` the bitmap of every entry whose key lies within
    /// `(self.left, self.right)`.
    fn run_level_0(&mut self, starting_left_bound: &'t [u8], group_size: usize) -> Result<()> {
        let left_key =
            FacetKey { field_id: self.field_id, level: 0, left_bound: starting_left_bound };
        let iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // the right side of the iter range is unbounded, so we need to make sure that we are not iterating
            // on the next field id
            if key.field_id != self.field_id {
                return Ok(());
            }
            // Entries located before the left bound are ignored.
            let should_skip = {
                match self.left {
                    Bound::Included(left) => left > key.left_bound,
                    Bound::Excluded(left) => left >= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_skip {
                continue;
            }
            // Keys are visited in increasing order: once one of them is past the
            // right bound, all the following ones are too.
            let should_stop = {
                match self.right {
                    Bound::Included(right) => right < key.left_bound,
                    Bound::Excluded(right) => right <= key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_stop {
                break;
            }
            if RangeBounds::<&[u8]>::contains(&(self.left, self.right), &key.left_bound) {
                *self.docids |= value.bitmap;
            }
        }
        Ok(())
    }

    /// Recursive part of the algorithm for level > 0
    ///
    /// Visits at most `group_size` entries of `level`, starting at
    /// `starting_left_bound`. Each entry is a group covering the keys from its own
    /// left bound up to (excluding) the left bound of the next entry; the last
    /// visited group extends up to `rightmost_bound`. A group that is entirely
    /// inside the searched range has its bitmap added as a whole; a group that
    /// only partially overlaps it is explored one level below.
    ///
    /// # Panics
    ///
    /// Panics if no entry is found at `level` from `starting_left_bound`.
    fn run(
        &mut self,
        level: u8,
        starting_left_bound: &'t [u8],
        rightmost_bound: Bound<&'t [u8]>,
        group_size: usize,
    ) -> Result<()> {
        if level == 0 {
            return self.run_level_0(starting_left_bound, group_size);
        }

        let left_key = FacetKey { field_id: self.field_id, level, left_bound: starting_left_bound };
        let mut iter = self.db.range(&self.rtxn, &(left_key..))?.take(group_size);

        // The right edge of a group is only known once the next key has been read,
        // so the loop always handles the *previous* entry.
        let (mut previous_key, mut previous_value) = iter.next().unwrap()?;

        for el in iter {
            let (next_key, next_value) = el?;
            // the right of the iter range is unbounded, so we need to make sure that we are not iterating
            // on the next field id
            if next_key.field_id != self.field_id {
                // `previous_key` is then the last group of this field: leave the
                // loop so that it is handled below instead of being dropped.
                break;
            }
            // now, do we skip, stop, or visit?
            // The previous group only holds keys strictly below `next_key`, so it
            // cannot match when the left bound is at or after `next_key`, whether
            // that bound is included or excluded.
            let should_skip = {
                match self.left {
                    Bound::Included(left) | Bound::Excluded(left) => left >= next_key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_skip {
                previous_key = next_key;
                previous_value = next_value;
                continue;
            }

            // should we stop?
            // Yes if the right bound is before the start of the previous group.
            let should_stop = {
                match self.right {
                    Bound::Included(right) => right < previous_key.left_bound,
                    Bound::Excluded(right) => right <= previous_key.left_bound,
                    Bound::Unbounded => false,
                }
            };
            if should_stop {
                return Ok(());
            }

            // should we take the whole thing, without recursing down?
            let should_take_whole_group = {
                let left_condition = match self.left {
                    Bound::Included(left) => previous_key.left_bound >= left,
                    Bound::Excluded(left) => previous_key.left_bound > left,
                    Bound::Unbounded => true,
                };
                // Every key of the group is strictly below `next_key`, so the same
                // comparison is valid for an included and an excluded right bound.
                let right_condition = match self.right {
                    Bound::Included(right) | Bound::Excluded(right) => {
                        next_key.left_bound <= right
                    }
                    Bound::Unbounded => true,
                };
                left_condition && right_condition
            };
            if should_take_whole_group {
                *self.docids |= &previous_value.bitmap;
                previous_key = next_key;
                previous_value = next_value;
                continue;
            }

            // The group partially overlaps the range: explore it one level below.
            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let rightmost_bound = Bound::Excluded(next_key.left_bound);
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;

            previous_key = next_key;
            previous_value = next_value;
        }

        // previous_key/previous_value are the last element
        // Its right edge is given by `rightmost_bound` instead of a next key.

        // now, do we skip, stop, or visit?
        let should_skip = {
            match (self.left, rightmost_bound) {
                (Bound::Included(left), Bound::Included(right)) => left > right,
                (Bound::Included(left), Bound::Excluded(right)) => left >= right,
                (Bound::Excluded(left), Bound::Included(right) | Bound::Excluded(right)) => {
                    left >= right
                }
                (Bound::Unbounded, _) => false,
                (_, Bound::Unbounded) => false, // should never run?
            }
        };
        if should_skip {
            return Ok(());
        }

        // should we stop?
        // Same test as inside the loop: an included right bound equal to the start
        // of the group still matches the first key of that group.
        let should_stop = {
            match self.right {
                Bound::Included(right) => right < previous_key.left_bound,
                Bound::Excluded(right) => right <= previous_key.left_bound,
                Bound::Unbounded => false,
            }
        };
        if should_stop {
            return Ok(());
        }

        // should we take the whole thing, without recursing down?
        let should_take_whole_group = {
            let left_condition = match self.left {
                Bound::Included(left) => previous_key.left_bound >= left,
                Bound::Excluded(left) => previous_key.left_bound > left,
                Bound::Unbounded => true,
            };
            let right_condition = match (self.right, rightmost_bound) {
                (Bound::Included(right), Bound::Included(rightmost)) => rightmost <= right,
                (Bound::Included(right), Bound::Excluded(rightmost)) => rightmost < right,
                // e.g. x < 8 and rightmost is <= y
                // condition met if rightmost < 8
                (Bound::Excluded(right), Bound::Included(rightmost)) => rightmost < right,
                // e.g. x < 8 and rightmost is < y
                // condition met if y <= 8
                (Bound::Excluded(right), Bound::Excluded(rightmost)) => rightmost <= right,
                // e.g. x < inf. , so yes we take the whole thing
                (Bound::Unbounded, _) => true,
                // e.g. x < 7 , rightmost is inf
                (_, Bound::Unbounded) => false, // panic?
            };
            left_condition && right_condition
        };
        if should_take_whole_group {
            *self.docids |= &previous_value.bitmap;
        } else {
            let level = level - 1;
            let starting_left_bound = previous_key.left_bound;
            let group_size = previous_value.size as usize;

            self.run(level, starting_left_bound, rightmost_bound, group_size)?;
        }

        Ok(())
    }
}
#[cfg(test)]
mod tests {
    use crate::{
        heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec,
        search::facet::test::FacetIndex, snapshot_tests::display_bitmap,
    };
    use rand::{Rng, SeedableRng};
    use roaring::RoaringBitmap;
    use std::ops::Bound;

    use super::find_docids_of_facet_within_bounds;

    /// Build an index where each facet value `i` in `0..256` is associated with
    /// the single document id `i`.
    fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();
        for i in 0..256u16 {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(i as u32);
            index.insert(&mut txn, 0, &(i as f64), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    /// Build an index from 128 pseudo-random facet values in `0..256`, each one
    /// associated with the document ids `key` and `key + 100`.
    ///
    /// The generator is seeded with a constant so that the content is reproducible.
    fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
        let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
        let mut txn = index.env.write_txn().unwrap();

        // `gen_range` needs mutable access to the generator.
        let mut rng = rand::rngs::SmallRng::from_seed([0; 32]);
        let keys =
            std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();

        for key in keys {
            let mut bitmap = RoaringBitmap::new();
            bitmap.insert(key);
            bitmap.insert(key + 100);
            index.insert(&mut txn, 0, &(key as f64), &bitmap);
        }
        txn.commit().unwrap();
        index
    }

    #[test]
    fn random_looking_index_snap() {
        let index = get_random_looking_index();
        insta::assert_display_snapshot!(index)
    }

    /// Ranges starting at 0 whose right bound grows from 0 to 255.
    #[test]
    fn filter_range_increasing() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();

            let mut results = String::new();
            for value in 0..=255 {
                let value = value as f64;
                let start = Bound::Included(0.);
                let end = Bound::Included(value);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(
                format!("filter_range_{i}_increasing_included_bounds"),
                results
            );

            let mut results = String::new();
            for value in 0..=255 {
                let value = value as f64;
                let start = Bound::Excluded(0.);
                let end = Bound::Excluded(value);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(
                format!("filter_range_{i}_increasing_excluded_bounds"),
                results
            );

            txn.commit().unwrap();
        }
    }

    /// Ranges ending at 255 whose left bound shrinks from 255 down to 0.
    #[test]
    fn filter_range_decreasing() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();

            let mut results = String::new();
            for value in (0..=255).rev() {
                let value = value as f64;
                let start = Bound::Included(value);
                let end = Bound::Included(255.);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(
                format!("filter_range_{i}_decreasing_included_bounds"),
                results
            );

            let mut results = String::new();
            for value in (0..=255).rev() {
                let value = value as f64;
                let start = Bound::Excluded(value);
                let end = Bound::Excluded(255.);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(
                format!("filter_range_{i}_decreasing_excluded_bounds"),
                results
            );

            txn.commit().unwrap();
        }
    }

    /// Ranges of the form `[v, 255 - v]` for `v` going from 128 down to 0.
    #[test]
    fn filter_range_pinch() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.into_iter().enumerate() {
            let txn = index.env.read_txn().unwrap();

            let mut results = String::new();
            for value in (0..=128).rev() {
                let value = value as f64;
                let start = Bound::Included(value);
                let end = Bound::Included(255. - value);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_range_{i}_pinch_included_bounds"), results);

            let mut results = String::new();
            for value in (0..=128).rev() {
                let value = value as f64;
                let start = Bound::Excluded(value);
                let end = Bound::Excluded(255. - value);
                let docids = find_docids_of_facet_within_bounds::<OrderedF64Codec>(
                    &txn,
                    &index.db.content,
                    0,
                    &start,
                    &end,
                )
                .unwrap();
                results.push_str(&format!("{}\n", display_bitmap(&docids)));
            }
            insta::assert_snapshot!(format!("filter_range_{i}_pinch_excluded_bounds"), results);

            txn.commit().unwrap();
        }
    }
}

View File

@ -1,8 +1,8 @@
use roaring::RoaringBitmap;
use crate::heed_codec::facet::new::{ use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
}; };
use crate::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level}; use super::{get_first_facet_value, get_highest_level};
@ -11,20 +11,20 @@ pub fn ascending_facet_sort<'t>(
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> { ) -> Result<Box<dyn Iterator<Item = Result<(&'t [u8], RoaringBitmap)>> + 't>> {
let highest_level = let highest_level =
get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id); get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>( if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(
rtxn, rtxn,
&db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
field_id, field_id,
) { )? {
let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX); let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }) Ok(Box::new(AscendingFacetSort { rtxn, db, field_id, stack: vec![(candidates, iter)] }))
} else { } else {
return Box::new(std::iter::empty()); Ok(Box::new(std::iter::empty()))
} }
} }
@ -39,7 +39,7 @@ struct AscendingFacetSort<'t, 'e> {
} }
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> { impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
type Item = (&'t [u8], RoaringBitmap); type Item = Result<(&'t [u8], RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
'outer: loop { 'outer: loop {
@ -67,15 +67,15 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
*documents_ids -= &bitmap; *documents_ids -= &bitmap;
if level == 0 { if level == 0 {
return Some((left_bound, bitmap)); return Some(Ok((left_bound, bitmap)));
} }
let starting_key_below = let starting_key_below =
FacetKey { field_id: self.field_id, level: level - 1, left_bound }; FacetKey { field_id: self.field_id, level: level - 1, left_bound };
let iter = self let iter = match self.db.range(&self.rtxn, &(starting_key_below..)) {
.db Ok(iter) => iter,
.range(&self.rtxn, &(starting_key_below..)) Err(e) => return Some(Err(e.into())),
.unwrap() }
.take(group_size as usize); .take(group_size as usize);
self.stack.push((bitmap, iter)); self.stack.push((bitmap, iter));
continue 'outer; continue 'outer;
@ -88,14 +88,19 @@ impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::{
ascending_facet_sort::ascending_facet_sort, codec::U16Codec, display_bitmap, Index,
};
use heed::BytesDecode; use heed::BytesDecode;
use rand::Rng;
use rand::SeedableRng;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
fn get_simple_index() -> Index<U16Codec> { use crate::{
let index = Index::<U16Codec>::new(4, 8); heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec,
search::facet::{facet_sort_ascending::ascending_facet_sort, test::FacetIndex},
snapshot_tests::display_bitmap,
};
fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
for i in 0..256u16 { for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
@ -105,18 +110,19 @@ mod tests {
txn.commit().unwrap(); txn.commit().unwrap();
index index
} }
fn get_random_looking_index() -> Index<U16Codec> { fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
let index = Index::<U16Codec>::new(4, 8); let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
let rng = fastrand::Rng::with_seed(0); let rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::<Vec<u32>>(); let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for (_i, key) in keys.into_iter().enumerate() { for (_i, key) in keys.into_iter().enumerate() {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
bitmap.insert(key); bitmap.insert(key);
bitmap.insert(key + 100); bitmap.insert(key + 100);
index.insert(&mut txn, 0, &(key as u16), &bitmap); index.insert(&mut txn, 0, &(key as f64), &bitmap);
} }
txn.commit().unwrap(); txn.commit().unwrap();
index index
@ -136,7 +142,7 @@ mod tests {
let mut results = String::new(); let mut results = String::new();
let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates); let iter = ascending_facet_sort(&txn, &index.db.content, 0, candidates);
for (facet, docids) in iter { for (facet, docids) in iter {
let facet = U16Codec::bytes_decode(facet).unwrap(); let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
} }
insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results); insta::assert_snapshot!(format!("filter_sort_{i}_ascending"), results);

View File

@ -1,10 +1,10 @@
use std::ops::Bound; use std::ops::Bound;
use roaring::RoaringBitmap;
use crate::heed_codec::facet::new::{ use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice, FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
}; };
use crate::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level, get_last_facet_value}; use super::{get_first_facet_value, get_highest_level, get_last_facet_value};
@ -13,21 +13,21 @@ fn descending_facet_sort<'t>(
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> Box<dyn Iterator<Item = (&'t [u8], RoaringBitmap)> + 't> { ) -> Result<Box<dyn Iterator<Item = Result<(&'t [u8], RoaringBitmap)>> + 't>> {
let highest_level = get_highest_level(rtxn, db, field_id); let highest_level = get_highest_level(rtxn, db, field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id) { if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound }; let first_key = FacetKey { field_id, level: highest_level, left_bound: first_bound };
let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id).unwrap(); let last_bound = get_last_facet_value::<MyByteSlice>(rtxn, db, field_id)?.unwrap();
let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound }; let last_key = FacetKey { field_id, level: highest_level, left_bound: last_bound };
let iter = db.rev_range(rtxn, &(first_key..=last_key)).unwrap().take(usize::MAX); let iter = db.rev_range(rtxn, &(first_key..=last_key))?.take(usize::MAX);
Box::new(DescendingFacetSort { Ok(Box::new(DescendingFacetSort {
rtxn, rtxn,
db, db,
field_id, field_id,
stack: vec![(candidates, iter, Bound::Included(last_bound))], stack: vec![(candidates, iter, Bound::Included(last_bound))],
}) }))
} else { } else {
return Box::new(std::iter::empty()); Ok(Box::new(std::iter::empty()))
} }
} }
@ -43,7 +43,7 @@ struct DescendingFacetSort<'t> {
} }
impl<'t> Iterator for DescendingFacetSort<'t> { impl<'t> Iterator for DescendingFacetSort<'t> {
type Item = (&'t [u8], RoaringBitmap); type Item = Result<(&'t [u8], RoaringBitmap)>;
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
'outer: loop { 'outer: loop {
@ -70,7 +70,7 @@ impl<'t> Iterator for DescendingFacetSort<'t> {
*documents_ids -= &bitmap; *documents_ids -= &bitmap;
if level == 0 { if level == 0 {
return Some((left_bound, bitmap)); return Some(Ok((left_bound, bitmap)));
} }
let starting_key_below = FacetKey { field_id, level: level - 1, left_bound }; let starting_key_below = FacetKey { field_id, level: level - 1, left_bound };
@ -89,14 +89,14 @@ impl<'t> Iterator for DescendingFacetSort<'t> {
}; };
let prev_right_bound = *right_bound; let prev_right_bound = *right_bound;
*right_bound = Bound::Excluded(left_bound); *right_bound = Bound::Excluded(left_bound);
let iter = self let iter = match self.db.rev_range(
.db &self.rtxn,
.rev_range( &(Bound::Included(starting_key_below), end_key_kelow),
&self.rtxn, ) {
&(Bound::Included(starting_key_below), end_key_kelow), Ok(iter) => iter,
) Err(e) => return Some(Err(e.into())),
.unwrap() }
.take(group_size as usize); .take(group_size as usize);
self.stack.push((bitmap, iter, prev_right_bound)); self.stack.push((bitmap, iter, prev_right_bound));
continue 'outer; continue 'outer;
@ -110,16 +110,20 @@ impl<'t> Iterator for DescendingFacetSort<'t> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::{
codec::{MyByteSlice, U16Codec},
descending_facet_sort::descending_facet_sort,
display_bitmap, FacetKeyCodec, Index,
};
use heed::BytesDecode; use heed::BytesDecode;
use rand::Rng;
use rand::SeedableRng;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
fn get_simple_index() -> Index<U16Codec> { use crate::{
let index = Index::<U16Codec>::new(4, 8); heed_codec::facet::new::{ordered_f64_codec::OrderedF64Codec, FacetKeyCodec, MyByteSlice},
search::facet::{facet_sort_descending::descending_facet_sort, test::FacetIndex},
snapshot_tests::display_bitmap,
};
fn get_simple_index() -> FacetIndex<OrderedF64Codec> {
let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
for i in 0..256u16 { for i in 0..256u16 {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
@ -129,18 +133,19 @@ mod tests {
txn.commit().unwrap(); txn.commit().unwrap();
index index
} }
fn get_random_looking_index() -> Index<U16Codec> { fn get_random_looking_index() -> FacetIndex<OrderedF64Codec> {
let index = Index::<U16Codec>::new(4, 8); let index = FacetIndex::<OrderedF64Codec>::new(4, 8);
let mut txn = index.env.write_txn().unwrap(); let mut txn = index.env.write_txn().unwrap();
let rng = fastrand::Rng::with_seed(0); let rng = rand::rngs::SmallRng::from_seed([0; 32]);
let keys = std::iter::from_fn(|| Some(rng.u32(..256))).take(128).collect::<Vec<u32>>(); let keys =
std::iter::from_fn(|| Some(rng.gen_range(0..256))).take(128).collect::<Vec<u32>>();
for (_i, key) in keys.into_iter().enumerate() { for (_i, key) in keys.into_iter().enumerate() {
let mut bitmap = RoaringBitmap::new(); let mut bitmap = RoaringBitmap::new();
bitmap.insert(key); bitmap.insert(key);
bitmap.insert(key + 100); bitmap.insert(key + 100.);
index.insert(&mut txn, 0, &(key as u16), &bitmap); index.insert(&mut txn, 0, &(key as f64), &bitmap);
} }
txn.commit().unwrap(); txn.commit().unwrap();
index index
@ -161,7 +166,7 @@ mod tests {
let db = index.db.content.remap_key_type::<FacetKeyCodec<MyByteSlice>>(); let db = index.db.content.remap_key_type::<FacetKeyCodec<MyByteSlice>>();
let iter = descending_facet_sort(&txn, &db, 0, candidates); let iter = descending_facet_sort(&txn, &db, 0, candidates);
for (facet, docids) in iter { for (facet, docids) in iter {
let facet = U16Codec::bytes_decode(facet).unwrap(); let facet = OrderedF64Codec::bytes_decode(facet).unwrap();
results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids))); results.push_str(&format!("{facet}: {}\n", display_bitmap(&docids)));
} }
insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results); insta::assert_snapshot!(format!("filter_sort_{i}_descending"), results);

View File

@ -7,7 +7,6 @@ use either::Either;
pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token}; pub use filter_parser::{Condition, Error as FPError, FilterCondition, Span, Token};
use heed::types::DecodeIgnore; use heed::types::DecodeIgnore;
use heed::LazyDecode; use heed::LazyDecode;
use log::debug;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
// use super::FacetNumberRange; // use super::FacetNumberRange;

View File

@ -0,0 +1,459 @@
use crate::heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
};
use crate::Result;
use heed::Error;
use heed::{types::ByteSlice, BytesDecode, RoTxn, RwTxn};
use roaring::RoaringBitmap;
use super::get_highest_level;
/// Outcome of inserting a facet value into one level of the facet tree.
enum InsertionResult {
/// An entry that already existed at this level absorbed the change; no key
/// was added at this level.
InPlace,
/// A new key was written at this level (a new level-0 entry, or the right
/// half of a group that was split).
Insert,
}
/// Outcome of deleting a document id from one level of the facet tree.
///
/// `prev` and `next` hold the left bounds of the entries found around the
/// deleted level-0 key; they are computed at level 0 and handed unchanged to
/// the levels above.
enum DeletionResult {
/// The document id was removed from an existing entry and the entry was kept.
InPlace,
/// The group at this level was kept but rewritten after a key disappeared
/// below it (its left bound may have moved to `next`).
Reduce { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
/// The key at this level was deleted because nothing was left under it.
Remove { prev: Option<Vec<u8>>, next: Option<Vec<u8>> },
}
/// Applies single-key insertions and deletions to the multi-level facet database.
struct IncrementalFacetUpdate<'i> {
/// Database storing the facet groups of every field and level.
db: &'i heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
/// Number of children gathered per group when `insert` builds a new top
/// level; `delete` also drops the top level once it holds fewer groups than this.
group_size: usize,
/// Number of entries the highest level must reach before `insert` builds a
/// level above it.
min_level_size: usize,
/// Child count at which `insert_in_level` splits a group in two.
max_group_size: usize,
}
impl<'i> IncrementalFacetUpdate<'i> {
fn find_insertion_key_value<'a>(
&self,
field_id: u16,
level: u8,
search_key: &[u8],
txn: &RoTxn,
) -> Result<(FacetKey<Vec<u8>>, FacetGroupValue)> {
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(level);
prefix.extend_from_slice(search_key);
let mut prefix_iter = self
.db
.as_polymorph()
.prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(txn, &prefix.as_slice())?;
if let Some(e) = prefix_iter.next() {
let (key_bytes, value) = e?;
let key = FacetKeyCodec::<MyByteSlice>::bytes_decode(&key_bytes)
.ok_or(heed::Error::Encoding)?;
Ok((
FacetKeyCodec::<MyByteSlice>::bytes_decode(&key_bytes)
.ok_or(Error::Encoding)?
.into_owned(),
value,
))
} else {
let key = FacetKey { field_id, level, left_bound: search_key };
match self.db.get_lower_than(txn, &key)? {
Some((key, value)) => {
if key.level != level || key.field_id != field_id {
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(level);
let mut iter = self
.db
.as_polymorph()
.prefix_iter::<_, MyByteSlice, FacetGroupValueCodec>(
txn,
&prefix.as_slice(),
)?;
let (key_bytes, value) = iter.next().unwrap()?;
Ok((
FacetKeyCodec::<MyByteSlice>::bytes_decode(&key_bytes)
.ok_or(Error::Encoding)?
.into_owned(),
value,
))
} else {
Ok((key.into_owned(), value))
}
}
None => panic!(),
}
}
}
/// Adds `new_values` to the level-0 entry of `new_key`, creating it if needed.
///
/// Returns [`InsertionResult::InPlace`] when the key already existed and its
/// bitmap was extended, [`InsertionResult::Insert`] when a new entry of size 1
/// was written.
///
/// A single lookup is enough: a missing key covers both "the level is empty"
/// and "the level has other keys", which are handled identically.
///
/// # Errors
/// Fails when the database cannot be read or written.
fn insert_in_level_0<'t>(
    &self,
    txn: &'t mut RwTxn,
    field_id: u16,
    new_key: &[u8],
    new_values: &RoaringBitmap,
) -> Result<InsertionResult> {
    let key = FacetKey { field_id, level: 0, left_bound: new_key };

    let old_value = self.db.get(&txn, &key)?;
    match old_value {
        Some(mut updated_value) => {
            // Merge by reference: no need to clone the incoming bitmap.
            updated_value.bitmap |= new_values;
            self.db.put(txn, &key, &updated_value)?;
            Ok(InsertionResult::InPlace)
        }
        None => {
            let value = FacetGroupValue { bitmap: new_values.clone(), size: 1 };
            self.db.put(txn, &key, &value)?;
            Ok(InsertionResult::Insert)
        }
    }
}
/// Inserts `new_values` under `new_key` in `level` and, recursively, in every
/// level below it.
///
/// After the recursive call the group covering `new_key` is updated:
/// - its left bound is lowered to `new_key` when the key sorts before it;
/// - if nothing was added below, only its bitmap grows;
/// - if a key was added below, its size grows by one, and the group is split
///   in two once the size reaches `max_group_size`.
///
/// Returns [`InsertionResult::Insert`] only when a split added a key to this
/// level, [`InsertionResult::InPlace`] otherwise.
///
/// # Errors
/// Fails when the database cannot be read or written.
///
/// # Panics
/// Panics when the group located before the recursion cannot be found again,
/// or when the level below holds fewer children than the group size claims.
fn insert_in_level<'t>(
    &self,
    txn: &'t mut RwTxn,
    field_id: u16,
    level: u8,
    new_key: &[u8],
    new_values: &RoaringBitmap,
) -> Result<InsertionResult> {
    if level == 0 {
        return self.insert_in_level_0(txn, field_id, new_key, new_values);
    }
    let max_group_size = self.max_group_size;

    // The target group is located before recursing; the recursion only
    // touches the levels below this one.
    let (insertion_key, insertion_value) =
        self.find_insertion_key_value(field_id, level, new_key, txn)?;

    let result = self.insert_in_level(txn, field_id, level - 1, new_key, new_values)?;

    // A key smaller than the group's left bound extends the group to the
    // left, so the group is re-keyed under the new bound.
    let insertion_key = if new_key < insertion_key.left_bound.as_slice() {
        let mut new_insertion_key = insertion_key.clone();
        new_insertion_key.left_bound = new_key.to_vec();
        let is_deleted = self.db.delete(txn, &insertion_key.as_ref())?;
        assert!(is_deleted);
        self.db.put(txn, &new_insertion_key.as_ref(), &insertion_value)?;
        new_insertion_key
    } else {
        insertion_key
    };

    let mut updated_value = self
        .db
        .get(&txn, &insertion_key.as_ref())?
        .expect("the insertion group must exist at this level");

    match result {
        // TODO: this could go above the block recomputing insertion key
        // because we know that if we inserted in place, the key is not a new one
        // thus it doesn't extend a group
        InsertionResult::InPlace => {
            updated_value.bitmap |= new_values;
            self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
            return Ok(InsertionResult::InPlace);
        }
        // The level below gained a key: this group has one more child.
        InsertionResult::Insert => updated_value.size += 1,
    }

    if updated_value.size as usize != max_group_size {
        // Still below the split threshold: store the grown group.
        updated_value.bitmap |= new_values;
        self.db.put(txn, &insertion_key.as_ref(), &updated_value)?;
        return Ok(InsertionResult::InPlace);
    }

    // The group is full: recompute it from its children as two groups, the
    // left one keeping the current key and the right one starting at the
    // first child after the left half.
    let size_left = max_group_size / 2;
    let size_right = max_group_size - size_left;

    let level_below = level - 1;

    let (start_key, _) = self
        .db
        .get_greater_than_or_equal_to(
            &txn,
            &FacetKey {
                field_id,
                level: level_below,
                left_bound: insertion_key.left_bound.as_slice(),
            },
        )?
        .unwrap();

    let mut iter = self.db.range(&txn, &(start_key..))?.take(max_group_size);

    let group_left = {
        let mut values_left = RoaringBitmap::new();
        let mut taken = 0;
        for next in iter.by_ref() {
            let (_key, value) = next?;
            taken += 1;
            values_left |= &value.bitmap;
            if taken == size_left {
                break;
            }
        }
        let key = FacetKey { field_id, level, left_bound: insertion_key.left_bound.clone() };
        // NOTE(review): the cast assumes `max_group_size` fits in a `u8` — confirm.
        let value = FacetGroupValue { size: size_left as u8, bitmap: values_left };
        (key, value)
    };

    let group_right = {
        let mut values_right = RoaringBitmap::new();
        let mut right_start_key = None;
        for next in iter.by_ref() {
            let (key, value) = next?;
            if right_start_key.is_none() {
                right_start_key = Some(key.left_bound);
            }
            values_right |= &value.bitmap;
        }
        let key = FacetKey { field_id, level, left_bound: right_start_key.unwrap().to_vec() };
        let value = FacetGroupValue { size: size_right as u8, bitmap: values_right };
        (key, value)
    };
    // Release the read cursor before writing.
    drop(iter);

    let _ = self.db.delete(txn, &insertion_key.as_ref())?;

    self.db.put(txn, &group_left.0.as_ref(), &group_left.1)?;
    self.db.put(txn, &group_right.0.as_ref(), &group_right.1)?;

    Ok(InsertionResult::Insert)
}
/// Adds the document ids of `new_values` under the facet value `new_key` of
/// field `field_id`, updating every level of the tree.
///
/// When the insertion added a key to the highest level and that level now
/// holds at least `min_level_size` entries, a new level is built above it.
/// Every entry of the highest level is grouped, in order, into groups of
/// `group_size` children; the last group may be shorter. Building from what
/// the level actually contains avoids panicking when it holds fewer than
/// `group_size * group_size` entries, and avoids leaving entries uncovered
/// when it holds more.
///
/// Does nothing when `new_values` is empty.
///
/// # Errors
/// Fails when the database cannot be read or written, or a key cannot be decoded.
pub fn insert<'a, 't>(
    &self,
    txn: &'t mut RwTxn,
    field_id: u16,
    new_key: &[u8],
    new_values: &RoaringBitmap,
) -> Result<()> {
    if new_values.is_empty() {
        return Ok(());
    }
    let group_size = self.group_size;

    let highest_level = get_highest_level(&txn, &self.db, field_id)?;

    let result = self.insert_in_level(txn, field_id, highest_level, new_key, new_values)?;
    match result {
        InsertionResult::InPlace => return Ok(()),
        InsertionResult::Insert => {}
    }

    let mut highest_level_prefix = vec![];
    highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
    highest_level_prefix.push(highest_level);

    let size_highest_level = self
        .db
        .as_polymorph()
        .prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
        .count();

    if size_highest_level < self.min_level_size {
        return Ok(());
    }

    let groups_iter = self
        .db
        .as_polymorph()
        .prefix_iter::<_, ByteSlice, FacetGroupValueCodec>(&txn, &highest_level_prefix)?;

    // Builds one entry of the new level from the accumulated children.
    // NOTE(review): the cast assumes `group_size` fits in a `u8` — confirm.
    let make_group = |left_bound: Vec<u8>, bitmap: RoaringBitmap, nbr_children: usize| {
        (
            FacetKey { field_id, level: highest_level + 1, left_bound },
            FacetGroupValue { size: nbr_children as u8, bitmap },
        )
    };

    let mut to_add = vec![];
    // State of the group being accumulated: its left bound (set by its first
    // child), the union of its children's bitmaps and its child count.
    let mut first_key: Option<Vec<u8>> = None;
    let mut values = RoaringBitmap::new();
    let mut nbr_children = 0usize;

    for entry in groups_iter {
        let (key_bytes, child) = entry?;
        if first_key.is_none() {
            let child_key = FacetKeyCodec::<MyByteSlice>::bytes_decode(&key_bytes)
                .ok_or(Error::Encoding)?;
            first_key = Some(child_key.left_bound.to_vec());
        }
        values |= child.bitmap;
        nbr_children += 1;

        if nbr_children == group_size {
            if let Some(left_bound) = first_key.take() {
                let bitmap = std::mem::replace(&mut values, RoaringBitmap::new());
                to_add.push(make_group(left_bound, bitmap, nbr_children));
            }
            nbr_children = 0;
        }
    }
    // Trailing, shorter group.
    if let Some(left_bound) = first_key {
        to_add.push(make_group(left_bound, values, nbr_children));
    }

    // The read cursor is gone at this point, so the new level can be written.
    for (key, value) in to_add {
        self.db.put(txn, &key.as_ref(), &value)?;
    }
    Ok(())
}
/// Removes document id `value` from the group of `level` covering `key`,
/// after removing it from every level below.
///
/// - Nothing disappeared below: the id is removed from the group's bitmap and
///   [`DeletionResult::InPlace`] is returned.
/// - A key disappeared below: the group's size is decremented; an empty group
///   is deleted ([`DeletionResult::Remove`]).
/// - Otherwise the group is rewritten, its left bound moving to the next
///   level-0 key when `key` was its left bound ([`DeletionResult::Reduce`]).
///
/// # Errors
/// Fails when the database cannot be read or written.
///
/// # Panics
/// Panics when the group must be re-keyed but no next key was reported.
fn delete_in_level<'t>(
    &self,
    txn: &'t mut RwTxn,
    field_id: u16,
    level: u8,
    key: &[u8],
    value: u32,
) -> Result<DeletionResult> {
    if level == 0 {
        return self.delete_in_level_0(txn, field_id, key, value);
    }
    let (deletion_key, mut group) = self.find_insertion_key_value(field_id, level, key, txn)?;

    let below = self.delete_in_level(txn, field_id, level - 1, key, value)?;

    let (prev_key, next_key, child_removed) = match below {
        DeletionResult::InPlace => {
            // NOTE(review): the id is dropped from the group even if another
            // child of this group might still contain it — confirm an id
            // cannot appear under two keys of the same group.
            group.bitmap.remove(value);
            self.db.put(txn, &deletion_key.as_ref(), &group)?;
            return Ok(DeletionResult::InPlace);
        }
        DeletionResult::Reduce { prev, next } => (prev, next, false),
        DeletionResult::Remove { prev, next } => (prev, next, true),
    };

    if child_removed {
        group.size -= 1;
    }

    if group.size == 0 {
        // No child left: the group itself disappears.
        self.db.delete(txn, &deletion_key.as_ref())?;
        return Ok(DeletionResult::Remove { prev: prev_key, next: next_key });
    }

    let mut rekeyed = deletion_key.clone();
    if key == deletion_key.left_bound {
        // The deleted key was the group's left bound; the group now starts
        // at the following key.
        rekeyed.left_bound = next_key.clone().unwrap();
    }
    group.bitmap.remove(value);
    let _ = self.db.delete(txn, &deletion_key.as_ref())?;
    self.db.put(txn, &rekeyed.as_ref(), &group)?;
    Ok(DeletionResult::Reduce { prev: prev_key, next: next_key })
}
/// Removes document id `value` from the level-0 entry of `key`.
///
/// When the entry still holds other ids it is rewritten and
/// [`DeletionResult::InPlace`] is returned. When it becomes empty it is
/// deleted and [`DeletionResult::Remove`] is returned with the left bounds of
/// its level-0 neighbours. A neighbour is only reported when it belongs to
/// the same field and to level 0, so an entry of another field or level is
/// never mistaken for a sibling.
///
/// # Errors
/// Fails when the database cannot be read or written.
///
/// # Panics
/// Panics when no level-0 entry exists for `key`.
fn delete_in_level_0<'t>(
    &self,
    txn: &'t mut RwTxn,
    field_id: u16,
    key: &[u8],
    value: u32,
) -> Result<DeletionResult> {
    let key = FacetKey { field_id, level: 0, left_bound: key };
    let mut bitmap =
        self.db.get(&txn, &key)?.expect("the level 0 entry of the key must exist").bitmap;
    bitmap.remove(value);

    if !bitmap.is_empty() {
        self.db.put(txn, &key, &FacetGroupValue { size: 1, bitmap })?;
        return Ok(DeletionResult::InPlace);
    }

    // Neighbours are read before the entry is deleted.
    let prev_key = self
        .db
        .get_lower_than(&txn, &key)?
        .filter(|(prev, _)| prev.field_id == field_id && prev.level == 0)
        .map(|(prev, _)| prev.left_bound.to_vec());
    let next_key = self
        .db
        .get_greater_than(&txn, &key)?
        .filter(|(next, _)| next.field_id == field_id && next.level == 0)
        .map(|(next, _)| next.left_bound.to_vec());

    self.db.delete(txn, &key)?;
    Ok(DeletionResult::Remove { prev: prev_key, next: next_key })
}
pub fn delete<'a, 't>(
&self,
txn: &'t mut RwTxn,
field_id: u16,
key: &[u8],
value: u32,
) -> Result<()> {
if self.db.get(txn, &FacetKey { field_id, level: 0, left_bound: key })?.is_none() {
return Ok(());
}
let highest_level = get_highest_level(&txn, &self.db, field_id)?;
// let key_bytes = BoundCodec::bytes_encode(&key).unwrap();
let result = self.delete_in_level(txn, field_id, highest_level as u8, key, value)?;
match result {
DeletionResult::InPlace => return Ok(()),
DeletionResult::Reduce { .. } => {}
DeletionResult::Remove { .. } => {}
}
let mut highest_level_prefix = vec![];
highest_level_prefix.extend_from_slice(&field_id.to_be_bytes());
highest_level_prefix.push(highest_level);
if highest_level == 0
|| self
.db
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(&txn, &highest_level_prefix)?
.count()
>= self.group_size
{
return Ok(());
}
let mut to_delete = vec![];
let mut iter = self
.db
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(txn, &highest_level_prefix)?;
while let Some(el) = iter.next() {
let (k, _) = el?;
to_delete.push(
FacetKeyCodec::<MyByteSlice>::bytes_decode(k).ok_or(Error::Encoding)?.into_owned(),
);
}
drop(iter);
for k in to_delete {
self.db.delete(txn, &k.as_ref())?;
}
Ok(())
}
}

View File

@ -10,38 +10,39 @@ pub use self::filter::Filter;
mod facet_distribution; mod facet_distribution;
mod facet_distribution_iter; mod facet_distribution_iter;
mod facet_range_search;
mod facet_sort_ascending; mod facet_sort_ascending;
mod facet_sort_descending; mod facet_sort_descending;
mod filter; mod filter;
mod incremental_update;
fn get_first_facet_value<'t, BoundCodec>( pub(crate) fn get_first_facet_value<'t, BoundCodec>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> Option<BoundCodec::DItem> ) -> crate::Result<Option<BoundCodec::DItem>>
where where
BoundCodec: BytesDecode<'t>, BoundCodec: BytesDecode<'t>,
{ {
let mut level0prefix = vec![]; let mut level0prefix = vec![];
level0prefix.extend_from_slice(&field_id.to_be_bytes()); level0prefix.extend_from_slice(&field_id.to_be_bytes());
level0prefix.push(0); level0prefix.push(0);
let mut level0_iter_forward = db let mut level0_iter_forward =
.as_polymorph() db.as_polymorph().prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?;
.prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())
.unwrap();
if let Some(first) = level0_iter_forward.next() { if let Some(first) = level0_iter_forward.next() {
let (first_key, _) = first.unwrap(); let (first_key, _) = first?;
let first_key = FacetKeyCodec::<BoundCodec>::bytes_decode(first_key).unwrap(); let first_key =
Some(first_key.left_bound) FacetKeyCodec::<BoundCodec>::bytes_decode(first_key).ok_or(heed::Error::Encoding)?;
Ok(Some(first_key.left_bound))
} else { } else {
None Ok(None)
} }
} }
fn get_last_facet_value<'t, BoundCodec>( pub(crate) fn get_last_facet_value<'t, BoundCodec>(
txn: &'t RoTxn, txn: &'t RoTxn,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> Option<BoundCodec::DItem> ) -> crate::Result<Option<BoundCodec::DItem>>
where where
BoundCodec: BytesDecode<'t>, BoundCodec: BytesDecode<'t>,
{ {
@ -50,30 +51,129 @@ where
level0prefix.push(0); level0prefix.push(0);
let mut level0_iter_backward = db let mut level0_iter_backward = db
.as_polymorph() .as_polymorph()
.rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice()) .rev_prefix_iter::<_, ByteSlice, ByteSlice>(txn, level0prefix.as_slice())?;
.unwrap();
if let Some(last) = level0_iter_backward.next() { if let Some(last) = level0_iter_backward.next() {
let (last_key, _) = last.unwrap(); let (last_key, _) = last?;
let last_key = FacetKeyCodec::<BoundCodec>::bytes_decode(last_key).unwrap(); let last_key =
Some(last_key.left_bound) FacetKeyCodec::<BoundCodec>::bytes_decode(last_key).ok_or(heed::Error::Encoding)?;
Ok(Some(last_key.left_bound))
} else { } else {
None Ok(None)
} }
} }
fn get_highest_level<'t>( pub(crate) fn get_highest_level<'t>(
txn: &'t RoTxn<'t>, txn: &'t RoTxn<'t>,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>, db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16, field_id: u16,
) -> u8 { ) -> crate::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes(); let field_id_prefix = &field_id.to_be_bytes();
db.as_polymorph() Ok(db
.rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix) .as_polymorph()
.unwrap() .rev_prefix_iter::<_, ByteSlice, ByteSlice>(&txn, field_id_prefix)?
.next() .next()
.map(|el| { .map(|el| {
let (key, _) = el.unwrap(); let (key, _) = el.unwrap();
let key = FacetKeyCodec::<MyByteSlice>::bytes_decode(key).unwrap(); let key = FacetKeyCodec::<MyByteSlice>::bytes_decode(key).unwrap();
key.level key.level
}) })
.unwrap_or(0) .unwrap_or(0))
}
#[cfg(test)]
mod test {
use std::{fmt::Display, marker::PhantomData, rc::Rc};
use heed::{BytesDecode, BytesEncode, Env};
use tempfile::TempDir;
use crate::{
heed_codec::facet::new::{
FacetGroupValue, FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice,
},
snapshot_tests::display_bitmap,
};
/// Test fixture: a heed environment holding a single facet database whose
/// left bounds are encoded and decoded with `BoundCodec`.
pub struct FacetIndex<BoundCodec>
where
for<'a> BoundCodec:
BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
{
/// Environment the database lives in; transactions are opened from it.
pub env: Env,
/// The facet database and its grouping parameters.
pub db: Database,
// Ties the index to its bound codec without storing a value of it.
_phantom: PhantomData<BoundCodec>,
}
/// Facet database handle together with the grouping parameters of the fixture.
pub struct Database {
/// Facet groups of every field and level.
pub content: heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
/// Group size, clamped to `2..=127` by the constructors.
pub group_size: usize,
/// Maximum group size, raised by the constructors to at least `2 * group_size`.
pub max_group_size: usize,
// Handle on the temporary directory the environment was opened in; held
// here but never read in this module.
_tempdir: Rc<tempfile::TempDir>,
}
impl<BoundCodec> FacetIndex<BoundCodec>
where
    for<'a> BoundCodec:
        BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
{
    /// Normalises the requested sizes: `group_size` is clamped to `2..=127`
    /// and `max_group_size` is raised to at least twice that value.
    fn normalized_group_sizes(group_size: u8, max_group_size: u8) -> (usize, usize) {
        let group_size = usize::from(group_size.clamp(2, 127));
        let max_group_size = usize::from(max_group_size).max(group_size * 2);
        (group_size, max_group_size)
    }

    /// Opens the facet database already present in `tempdir`.
    ///
    /// # Panics
    /// Panics when the environment cannot be opened or holds no database.
    pub fn open_from_tempdir(
        tempdir: Rc<TempDir>,
        group_size: u8,
        max_group_size: u8,
    ) -> FacetIndex<BoundCodec> {
        let (group_size, max_group_size) =
            Self::normalized_group_sizes(group_size, max_group_size);

        let mut options = heed::EnvOpenOptions::new();
        let options = options.map_size(4096 * 4 * 10 * 100);
        // SAFETY(review): setting an LMDB flag is `unsafe` in heed; nothing in
        // this module shows why `MdbAlwaysFreePages` is sound here — TODO confirm.
        unsafe {
            options.flag(heed::flags::Flags::MdbAlwaysFreePages);
        }
        let env = options.open(tempdir.path()).unwrap();
        let content = env.open_database(None).unwrap().unwrap();

        let db = Database { content, group_size, max_group_size, _tempdir: tempdir };
        FacetIndex { db, env, _phantom: PhantomData }
    }

    /// Creates an empty facet database in a fresh temporary directory under
    /// `databases/`.
    ///
    /// # Panics
    /// Panics when the directory, the environment or the database cannot be created.
    pub fn new(group_size: u8, max_group_size: u8) -> FacetIndex<BoundCodec> {
        let (group_size, max_group_size) =
            Self::normalized_group_sizes(group_size, max_group_size);

        let mut options = heed::EnvOpenOptions::new();
        let options = options.map_size(4096 * 4 * 100);
        let tempdir = tempfile::TempDir::new_in("databases/").unwrap();
        let env = options.open(tempdir.path()).unwrap();
        let content = env.create_database(None).unwrap();

        let db = Database { content, group_size, max_group_size, _tempdir: Rc::new(tempdir) };
        FacetIndex { db, env, _phantom: PhantomData }
    }
}
impl<BoundCodec> Display for FacetIndex<BoundCodec>
where
    for<'a> <BoundCodec as BytesEncode<'a>>::EItem: Sized + Display,
    for<'a> BoundCodec:
        BytesEncode<'a> + BytesDecode<'a, DItem = <BoundCodec as BytesEncode<'a>>::EItem>,
{
    /// Writes one line per database entry, in key order: field id, level,
    /// decoded left bound, group size and the document ids of the group.
    ///
    /// # Panics
    /// Panics when the database cannot be read or a bound cannot be decoded.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let txn = self.env.read_txn().unwrap();
        for entry in self.db.content.iter(&txn).unwrap() {
            let (key, group) = entry.unwrap();
            let (field_id, level, size) = (key.field_id, key.level, group.size);
            let bound = BoundCodec::bytes_decode(key.left_bound).unwrap();
            writeln!(
                f,
                "{field_id:<2} {level:<2} k{bound:<8} {size:<4} {values:?}",
                values = display_bitmap(&group.bitmap)
            )?;
        }
        Ok(())
    }
}
} }