MeiliSearch/milli/src/search/facet/facet_distribution_iter.rs

291 lines
11 KiB
Rust
Raw Normal View History

2023-05-25 12:28:26 +02:00
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::ControlFlow;
use heed::Result;
use roaring::RoaringBitmap;
use super::{get_first_facet_value, get_highest_level};
use crate::heed_codec::facet::{FacetGroupKey, FacetGroupKeyCodec, FacetGroupValueCodec};
use crate::heed_codec::ByteSliceRefCodec;
2022-09-07 18:04:07 +02:00
use crate::DocumentId;
/// Visit the facet distribution of `candidates` for `field_id`, one facet value at a time,
/// following the order of the facet keys stored in `db`.
///
/// `callback` is invoked with:
/// - the facet value, as a byte slice
/// - the number of candidate documents which contain this facet value
/// - the id of a document which contains the facet value; callers should only rely on it
///   being *some* document holding this value
///
/// The callback returns a `ControlFlow<()>`: `Continue` asks for the next facet value,
/// `Break` stops the traversal.
///
/// # Errors
///
/// Propagates any error returned by the database helpers, by the traversal, or by `callback`.
pub fn lexicographically_iterate_over_facet_distribution<'t, CB>(
    rtxn: &'t heed::RoTxn<'t>,
    db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
    field_id: u16,
    candidates: &RoaringBitmap,
    callback: CB,
) -> Result<()>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    let top_level = get_highest_level(
        rtxn,
        db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
        field_id,
    )?;
    let first_bound = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)?;

    // Without a first facet value there is nothing to visit for this field.
    if let Some(first_bound) = first_bound {
        let mut walker = LexicographicFacetDistribution { rtxn, db, field_id, callback };
        // The top level is not bounded by a parent group, hence the `usize::MAX` group size.
        walker.iterate(candidates, top_level, first_bound, usize::MAX)?;
    }

    Ok(())
}
pub fn count_iterate_over_facet_distribution<'t, CB>(
2023-05-25 12:28:26 +02:00
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
mut callback: CB,
) -> Result<()>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
2023-05-25 12:28:26 +02:00
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct LevelEntry<'t> {
/// The number of candidates in this entry.
count: u64,
/// The key level of the entry.
level: Reverse<u8>,
/// The left bound key.
left_bound: &'t [u8],
/// The number of keys we must look for after `left_bound`.
group_size: u8,
/// Any docid in the set of matching documents. Used to find the original facet string.
any_docid: u32,
2023-05-25 12:28:26 +02:00
}
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
)?;
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
// We first fill the heap with values from the highest level
let starting_key =
FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
for el in db.range(rtxn, &(&starting_key..)).unwrap().take(usize::MAX) {
let (key, value) = el.unwrap();
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let count = intersection.len();
2023-05-25 12:28:26 +02:00
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
2023-05-25 12:28:26 +02:00
});
}
}
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
{
2023-05-25 12:28:26 +02:00
if let Reverse(0) = level {
match (callback)(left_bound, count, any_docid)? {
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(()),
2023-05-25 12:28:26 +02:00
}
} else {
2023-05-29 15:51:00 +02:00
let starting_key = FacetGroupKey { field_id, level: level.0 - 1, left_bound };
2023-05-25 12:28:26 +02:00
for el in db.range(rtxn, &(&starting_key..)).unwrap().take(group_size as usize) {
let (key, value) = el.unwrap();
// The range is unbounded on the right and the group size for the highest level is MAX,
// so we need to check that we are not iterating over the next field id
if key.field_id != field_id {
break;
}
let intersection = value.bitmap & candidates;
let count = intersection.len();
2023-05-25 12:28:26 +02:00
if count != 0 {
heap.push(LevelEntry {
count,
level: Reverse(key.level),
left_bound: key.left_bound,
group_size: value.size,
any_docid: intersection.min().unwrap(),
2023-05-25 12:28:26 +02:00
});
}
}
}
}
}
Ok(())
2023-05-25 12:28:26 +02:00
}
/// Iterate over the facets values by lexicographic order.
struct LexicographicFacetDistribution<'t, CB>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
callback: CB,
}
impl<'t, CB> LexicographicFacetDistribution<'t, CB>
where
    CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
    /// Visit at most `group_size` level-0 keys starting at `starting_bound` and call the
    /// callback for each of them that shares at least one document with `candidates`.
    ///
    /// The callback receives the facet value, the number of documents in common and the
    /// smallest document id in common.
    ///
    /// Returns `ControlFlow::Break` when the callback asked to stop or when a key belonging
    /// to another field was reached, `ControlFlow::Continue` otherwise.
    ///
    /// # Errors
    ///
    /// Returns any error raised by the database or by the callback.
    fn iterate_level_0(
        &mut self,
        candidates: &RoaringBitmap,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level: 0, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(starting_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = value.bitmap & candidates;
            // `min` is `Some` exactly when the intersection is not empty.
            if let Some(any_docid_in_common) = docids_in_common.min() {
                let cf =
                    (self.callback)(key.left_bound, docids_in_common.len(), any_docid_in_common)?;
                if let ControlFlow::Break(_) = cf {
                    return Ok(ControlFlow::Break(()));
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }

    /// Visit at most `group_size` keys of the given `level` starting at `starting_bound`,
    /// recursing into the level below for every group that shares at least one document
    /// with `candidates`. Level 0 is delegated to `iterate_level_0`.
    ///
    /// The recursion only receives the documents in common with the visited group, which
    /// narrows the candidates at each level.
    ///
    /// Returns `ControlFlow::Break` when the callback asked to stop or when a key belonging
    /// to another field was reached, `ControlFlow::Continue` otherwise.
    ///
    /// # Errors
    ///
    /// Returns any error raised by the database or by the callback.
    fn iterate(
        &mut self,
        candidates: &RoaringBitmap,
        level: u8,
        starting_bound: &'t [u8],
        group_size: usize,
    ) -> Result<ControlFlow<()>> {
        if level == 0 {
            return self.iterate_level_0(candidates, starting_bound, group_size);
        }
        let starting_key =
            FacetGroupKey { field_id: self.field_id, level, left_bound: starting_bound };
        let iter = self.db.range(self.rtxn, &(&starting_key..))?.take(group_size);
        for el in iter {
            let (key, value) = el?;
            // The range is unbounded on the right and the group size for the highest level is MAX,
            // so we need to check that we are not iterating over the next field id
            if key.field_id != self.field_id {
                return Ok(ControlFlow::Break(()));
            }
            let docids_in_common = value.bitmap & candidates;
            if !docids_in_common.is_empty() {
                // `level` is not 0 here, so the subtraction cannot underflow.
                let cf = self.iterate(
                    &docids_in_common,
                    level - 1,
                    key.left_bound,
                    value.size as usize,
                )?;
                if let ControlFlow::Break(_) = cf {
                    return Ok(ControlFlow::Break(()));
                }
            }
        }
        Ok(ControlFlow::Continue(()))
    }
}
#[cfg(test)]
mod tests {
    use std::ops::ControlFlow;

    use heed::BytesDecode;
    use roaring::RoaringBitmap;

    use super::lexicographically_iterate_over_facet_distribution;
    use crate::heed_codec::facet::OrderedF64Codec;
    use crate::milli_snap;
    use crate::search::facet::tests::{get_random_looking_index, get_simple_index};

    /// Snapshots the whole distribution of field 0 for the documents 0 to 255, on both
    /// test indexes.
    #[test]
    fn filter_distribution_all() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();

            // Record one "<facet>: <count>" line per facet value, never stopping early.
            let record = |facet_bytes, count, _docid| {
                let facet = OrderedF64Codec::bytes_decode(facet_bytes).unwrap();
                results.push_str(&format!("{facet}: {count}\n"));
                Ok(ControlFlow::Continue(()))
            };
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                record,
            )
            .unwrap();

            milli_snap!(results, i);
            txn.commit().unwrap();
        }
    }

    /// Same as `filter_distribution_all`, but the callback asks to stop once 100 facet
    /// values have been recorded.
    #[test]
    fn filter_distribution_all_stop_early() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates: RoaringBitmap = (0..=255).collect();
            let mut results = String::new();
            let mut recorded = 0;

            let record_first_hundred = |facet_bytes, count, _docid| {
                // The value is decoded before the limit check, so it is decoded even for
                // the call that ends up breaking.
                let facet = OrderedF64Codec::bytes_decode(facet_bytes).unwrap();
                if recorded == 100 {
                    return Ok(ControlFlow::Break(()));
                }
                recorded += 1;
                results.push_str(&format!("{facet}: {count}\n"));
                Ok(ControlFlow::Continue(()))
            };
            lexicographically_iterate_over_facet_distribution(
                &txn,
                index.content,
                0,
                &candidates,
                record_first_hundred,
            )
            .unwrap();

            milli_snap!(results, i);
            txn.commit().unwrap();
        }
    }
}