2022-09-08 11:53:01 +02:00
|
|
|
use std::ops::ControlFlow;
|
|
|
|
|
|
|
|
use heed::Result;
|
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
2022-08-30 14:17:40 +02:00
|
|
|
use super::{get_first_facet_value, get_highest_level};
|
2022-10-12 09:42:55 +02:00
|
|
|
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
|
|
|
|
use crate::heed_codec::ByteSliceRefCodec;
|
2022-09-07 18:04:07 +02:00
|
|
|
use crate::DocumentId;
|
2022-08-30 14:17:40 +02:00
|
|
|
|
2022-09-08 08:47:40 +02:00
|
|
|
/// Call the given closure on the facet distribution of the candidate documents.
|
|
|
|
///
|
|
|
|
/// The arguments to the closure are:
|
|
|
|
/// - the facet value, as a byte slice
|
|
|
|
/// - the number of documents among the candidates that contain this facet value
|
|
|
|
/// - the id of a document which contains the facet value. Note that this document
|
|
|
|
/// is not necessarily from the list of candidates, it is simply *any* document which
|
|
|
|
/// contains this facet value.
|
|
|
|
///
|
|
|
|
/// The return value of the closure is a `ControlFlow<()>` which indicates whether we should
|
|
|
|
/// keep iterating over the different facet values or stop.
|
2022-08-30 14:17:40 +02:00
|
|
|
pub fn iterate_over_facet_distribution<'t, CB>(
|
|
|
|
rtxn: &'t heed::RoTxn<'t>,
|
2022-10-12 09:42:55 +02:00
|
|
|
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
2022-08-30 14:17:40 +02:00
|
|
|
field_id: u16,
|
|
|
|
candidates: &RoaringBitmap,
|
|
|
|
callback: CB,
|
2022-08-30 15:22:39 +02:00
|
|
|
) -> Result<()>
|
|
|
|
where
|
2022-09-07 17:56:38 +02:00
|
|
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
2022-08-30 14:17:40 +02:00
|
|
|
{
|
|
|
|
let mut fd = FacetDistribution { rtxn, db, field_id, callback };
|
2022-10-12 09:42:55 +02:00
|
|
|
let highest_level = get_highest_level(
|
|
|
|
rtxn,
|
|
|
|
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
|
|
|
field_id,
|
|
|
|
)?;
|
2022-08-30 14:17:40 +02:00
|
|
|
|
2022-10-12 09:42:55 +02:00
|
|
|
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
|
2022-08-31 07:50:18 +02:00
|
|
|
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
|
2022-10-27 16:58:13 +02:00
|
|
|
Ok(())
|
2022-08-30 14:17:40 +02:00
|
|
|
} else {
|
2022-10-27 16:58:13 +02:00
|
|
|
Ok(())
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct FacetDistribution<'t, CB>
|
|
|
|
where
|
2022-09-07 17:56:38 +02:00
|
|
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
2022-08-30 14:17:40 +02:00
|
|
|
{
|
|
|
|
rtxn: &'t heed::RoTxn<'t>,
|
2022-10-12 09:42:55 +02:00
|
|
|
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
2022-08-30 14:17:40 +02:00
|
|
|
field_id: u16,
|
|
|
|
callback: CB,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t, CB> FacetDistribution<'t, CB>
|
|
|
|
where
|
2022-09-07 17:56:38 +02:00
|
|
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
2022-08-30 14:17:40 +02:00
|
|
|
{
|
|
|
|
fn iterate_level_0(
|
|
|
|
&mut self,
|
|
|
|
candidates: &RoaringBitmap,
|
|
|
|
starting_bound: &'t [u8],
|
|
|
|
group_size: usize,
|
2022-08-30 15:22:39 +02:00
|
|
|
) -> Result<ControlFlow<()>> {
|
2022-08-30 14:17:40 +02:00
|
|
|
let starting_key =
|
2022-09-05 13:01:36 +02:00
|
|
|
FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
|
2022-08-30 15:22:39 +02:00
|
|
|
let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
|
2022-08-30 14:17:40 +02:00
|
|
|
for el in iter {
|
2022-08-30 15:22:39 +02:00
|
|
|
let (key, value) = el?;
|
2022-08-30 14:17:40 +02:00
|
|
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
|
|
|
// so we need to check that we are not iterating over the next field id
|
|
|
|
if key.field_id != self.field_id {
|
2022-08-30 15:22:39 +02:00
|
|
|
return Ok(ControlFlow::Break(()));
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
2022-10-17 12:48:10 +02:00
|
|
|
let docids_in_common = value.bitmap & candidates;
|
|
|
|
if !docids_in_common.is_empty() {
|
|
|
|
let any_docid_in_common = docids_in_common.min().unwrap();
|
|
|
|
match (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?
|
|
|
|
{
|
|
|
|
ControlFlow::Continue(_) => (),
|
2022-08-30 15:22:39 +02:00
|
|
|
ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-10-27 16:58:13 +02:00
|
|
|
Ok(ControlFlow::Continue(()))
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
fn iterate(
|
|
|
|
&mut self,
|
|
|
|
candidates: &RoaringBitmap,
|
|
|
|
level: u8,
|
|
|
|
starting_bound: &'t [u8],
|
|
|
|
group_size: usize,
|
2022-08-30 15:22:39 +02:00
|
|
|
) -> Result<ControlFlow<()>> {
|
2022-08-30 14:17:40 +02:00
|
|
|
if level == 0 {
|
|
|
|
return self.iterate_level_0(candidates, starting_bound, group_size);
|
|
|
|
}
|
2022-09-05 13:49:52 +02:00
|
|
|
let starting_key =
|
|
|
|
FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
|
2022-10-27 16:58:13 +02:00
|
|
|
let iter = self.db.range(self.rtxn, &(&starting_key..)).unwrap().take(group_size);
|
2022-08-30 14:17:40 +02:00
|
|
|
|
|
|
|
for el in iter {
|
|
|
|
let (key, value) = el.unwrap();
|
|
|
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
|
|
|
// so we need to check that we are not iterating over the next field id
|
|
|
|
if key.field_id != self.field_id {
|
2022-08-30 15:22:39 +02:00
|
|
|
return Ok(ControlFlow::Break(()));
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
let docids_in_common = value.bitmap & candidates;
|
2022-10-27 16:58:13 +02:00
|
|
|
if !docids_in_common.is_empty() {
|
2022-08-30 15:22:39 +02:00
|
|
|
let cf = self.iterate(
|
|
|
|
&docids_in_common,
|
|
|
|
level - 1,
|
|
|
|
key.left_bound,
|
|
|
|
value.size as usize,
|
|
|
|
)?;
|
2022-08-30 14:17:40 +02:00
|
|
|
match cf {
|
|
|
|
ControlFlow::Continue(_) => {}
|
2022-08-30 15:22:39 +02:00
|
|
|
ControlFlow::Break(_) => return Ok(ControlFlow::Break(())),
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2022-10-27 16:58:13 +02:00
|
|
|
Ok(ControlFlow::Continue(()))
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::iterate_over_facet_distribution;
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::milli_snap;
    use crate::search::facet::tests::{get_random_looking_index, get_simple_index};

    /// Snapshots every `value: count` line reported for field 0 of both test indexes,
    /// with the document ids `0..=255` as candidates.
    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();

            let outcome = iterate_over_facet_distribution(
                &rtxn,
                index.content,
                0,
                &candidates,
                |raw_facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(raw_facet).unwrap();
                    results.push_str(&format!("{facet}: {count}\n"));
                    Ok(ControlFlow::Continue(()))
                },
            );
            outcome.unwrap();
            milli_snap!(results, i);

            rtxn.commit().unwrap();
        }
    }

    /// Same as `filter_distribution_all`, except that the closure breaks out of the
    /// iteration once 100 facet values have been recorded.
    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let rtxn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();
            let mut recorded = 0;

            let outcome = iterate_over_facet_distribution(
                &rtxn,
                index.content,
                0,
                &candidates,
                |raw_facet, count, _| {
                    let facet = OrderedF64Codec::bytes_decode(raw_facet).unwrap();
                    // Stop before recording anything past the 100th facet value.
                    if recorded == 100 {
                        return Ok(ControlFlow::Break(()));
                    }
                    recorded += 1;
                    results.push_str(&format!("{facet}: {count}\n"));
                    Ok(ControlFlow::Continue(()))
                },
            );
            outcome.unwrap();
            milli_snap!(results, i);

            rtxn.commit().unwrap();
        }
    }
}
|