2022-08-31 09:36:19 +02:00
|
|
|
use heed::Result;
|
2022-08-30 15:22:39 +02:00
|
|
|
use roaring::RoaringBitmap;
|
2022-08-30 14:17:40 +02:00
|
|
|
|
|
|
|
use super::{get_first_facet_value, get_highest_level};
|
2022-09-05 13:01:36 +02:00
|
|
|
use crate::heed_codec::facet::{
|
2022-10-12 09:42:55 +02:00
|
|
|
FacetGroupKey, FacetGroupKeyCodec, FacetGroupValue, FacetGroupValueCodec,
|
2022-09-01 11:40:29 +02:00
|
|
|
};
|
2022-10-12 09:42:55 +02:00
|
|
|
use crate::heed_codec::ByteSliceRefCodec;
|
2022-08-30 14:17:40 +02:00
|
|
|
|
2022-09-08 08:47:40 +02:00
|
|
|
/// Return an iterator which iterates over the given candidate documents in
|
|
|
|
/// ascending order of their facet value for the given field id.
|
|
|
|
///
|
|
|
|
/// The documents returned by the iterator are grouped by the facet values that
|
|
|
|
/// determined their rank. For example, given the documents:
|
|
|
|
///
|
|
|
|
/// ```ignore
|
|
|
|
/// 0: { "colour": ["blue", "green"] }
|
|
|
|
/// 1: { "colour": ["blue", "red"] }
|
|
|
|
/// 2: { "colour": ["orange", "red"] }
|
|
|
|
/// 3: { "colour": ["green", "red"] }
|
|
|
|
/// 4: { "colour": ["blue", "orange", "red"] }
|
|
|
|
/// ```
|
|
|
|
/// Then calling the function on the candidates `[0, 2, 3, 4]` will return an iterator
|
|
|
|
/// over the following elements:
|
|
|
|
/// ```ignore
|
|
|
|
/// [0, 4] // corresponds to all the documents within the candidates that have the facet value "blue"
|
|
|
|
/// [3] // same for "green"
|
|
|
|
/// [2] // same for "orange"
|
|
|
|
/// END
|
|
|
|
/// ```
|
|
|
|
/// Note that once a document id is returned by the iterator, it is never returned again.
|
2022-08-30 14:17:40 +02:00
|
|
|
pub fn ascending_facet_sort<'t>(
|
|
|
|
rtxn: &'t heed::RoTxn<'t>,
|
2022-10-12 09:42:55 +02:00
|
|
|
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
2022-08-30 14:17:40 +02:00
|
|
|
field_id: u16,
|
|
|
|
candidates: RoaringBitmap,
|
2023-02-01 14:19:38 +01:00
|
|
|
) -> Result<impl Iterator<Item = Result<(RoaringBitmap, &'t [u8])>> + 't> {
|
2022-08-31 09:36:19 +02:00
|
|
|
let highest_level = get_highest_level(rtxn, db, field_id)?;
|
2022-10-12 09:42:55 +02:00
|
|
|
if let Some(first_bound) = get_first_facet_value::<ByteSliceRefCodec>(rtxn, db, field_id)? {
|
2022-09-05 13:01:36 +02:00
|
|
|
let first_key = FacetGroupKey { field_id, level: highest_level, left_bound: first_bound };
|
2022-08-30 14:17:40 +02:00
|
|
|
let iter = db.range(rtxn, &(first_key..)).unwrap().take(usize::MAX);
|
|
|
|
|
2023-02-01 14:19:38 +01:00
|
|
|
Ok(itertools::Either::Left(AscendingFacetSort {
|
|
|
|
rtxn,
|
|
|
|
db,
|
|
|
|
field_id,
|
|
|
|
stack: vec![(candidates, iter)],
|
|
|
|
}))
|
2022-08-30 14:17:40 +02:00
|
|
|
} else {
|
2023-02-01 14:19:38 +01:00
|
|
|
Ok(itertools::Either::Right(std::iter::empty()))
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
struct AscendingFacetSort<'t, 'e> {
|
|
|
|
rtxn: &'t heed::RoTxn<'e>,
|
2022-10-12 09:42:55 +02:00
|
|
|
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
2022-08-30 14:17:40 +02:00
|
|
|
field_id: u16,
|
2022-10-27 16:58:13 +02:00
|
|
|
#[allow(clippy::type_complexity)]
|
2022-08-30 14:17:40 +02:00
|
|
|
stack: Vec<(
|
|
|
|
RoaringBitmap,
|
2022-10-12 09:42:55 +02:00
|
|
|
std::iter::Take<
|
|
|
|
heed::RoRange<'t, FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
|
|
|
>,
|
2022-08-30 14:17:40 +02:00
|
|
|
)>,
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t, 'e> Iterator for AscendingFacetSort<'t, 'e> {
|
2023-02-01 14:40:42 +01:00
|
|
|
type Item = Result<(RoaringBitmap, &'t [u8])>;
|
2022-08-30 14:17:40 +02:00
|
|
|
|
|
|
|
fn next(&mut self) -> Option<Self::Item> {
|
|
|
|
'outer: loop {
|
|
|
|
let (documents_ids, deepest_iter) = self.stack.last_mut()?;
|
|
|
|
for result in deepest_iter {
|
|
|
|
let (
|
2022-09-05 13:01:36 +02:00
|
|
|
FacetGroupKey { level, left_bound, field_id },
|
2022-08-30 14:17:40 +02:00
|
|
|
FacetGroupValue { size: group_size, mut bitmap },
|
|
|
|
) = result.unwrap();
|
|
|
|
// The range is unbounded on the right and the group size for the highest level is MAX,
|
|
|
|
// so we need to check that we are not iterating over the next field id
|
|
|
|
if field_id != self.field_id {
|
|
|
|
return None;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If the last iterator found an empty set of documents it means
|
|
|
|
// that we found all the documents in the sub level iterations already,
|
|
|
|
// we can pop this level iterator.
|
|
|
|
if documents_ids.is_empty() {
|
2022-12-05 10:33:31 +01:00
|
|
|
// break our of the for loop into the end of the 'outer loop, which
|
|
|
|
// pops the stack
|
2022-08-30 14:17:40 +02:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
bitmap &= &*documents_ids;
|
|
|
|
if !bitmap.is_empty() {
|
|
|
|
*documents_ids -= &bitmap;
|
|
|
|
|
|
|
|
if level == 0 {
|
2023-02-01 14:40:42 +01:00
|
|
|
// Since the level is 0, the left_bound is the exact value.
|
|
|
|
return Some(Ok((bitmap, left_bound)));
|
2022-08-30 14:17:40 +02:00
|
|
|
}
|
|
|
|
let starting_key_below =
|
2022-09-05 13:01:36 +02:00
|
|
|
FacetGroupKey { field_id: self.field_id, level: level - 1, left_bound };
|
2022-10-27 16:58:13 +02:00
|
|
|
let iter = match self.db.range(self.rtxn, &(starting_key_below..)) {
|
2022-08-30 15:22:39 +02:00
|
|
|
Ok(iter) => iter,
|
2022-10-27 16:58:13 +02:00
|
|
|
Err(e) => return Some(Err(e)),
|
2022-08-30 15:22:39 +02:00
|
|
|
}
|
|
|
|
.take(group_size as usize);
|
2022-08-30 14:17:40 +02:00
|
|
|
|
|
|
|
self.stack.push((bitmap, iter));
|
|
|
|
continue 'outer;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
self.stack.pop();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use roaring::RoaringBitmap;

    use crate::milli_snap;
    use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
    use crate::search::facet::tests::{
        get_random_looking_index, get_random_looking_string_index_with_multiple_field_ids,
        get_simple_index, get_simple_string_index_with_multiple_field_ids,
    };
    use crate::snapshot_tests::display_bitmap;

    /// Consumes the sort iterator and renders one line per returned group of
    /// document ids. Panics if the iterator yields an error.
    fn render_docids<'t>(
        iter: impl Iterator<Item = heed::Result<(RoaringBitmap, &'t [u8])>>,
    ) -> String {
        let mut results = String::new();
        for el in iter {
            let (docids, _) = el.unwrap();
            results.push_str(&display_bitmap(&docids));
            results.push('\n');
        }
        results
    }

    #[test]
    fn filter_sort_ascending() {
        let indexes = [get_simple_index(), get_random_looking_index()];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();
            let iter = ascending_facet_sort(&txn, index.content, 0, candidates).unwrap();
            let results = render_docids(iter);
            milli_snap!(results, i);

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_multiple_field_ids() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for (i, index) in indexes.iter().enumerate() {
            let txn = index.env.read_txn().unwrap();
            let candidates = (200..=300).collect::<RoaringBitmap>();

            let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
            let results = render_docids(iter);
            milli_snap!(results, format!("{i}-0"));

            let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
            let results = render_docids(iter);
            milli_snap!(results, format!("{i}-1"));

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_no_candidates() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in &indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();

            let iter = ascending_facet_sort(&txn, index.content, 0, candidates.clone()).unwrap();
            let results = render_docids(iter);
            assert!(results.is_empty());

            let iter = ascending_facet_sort(&txn, index.content, 1, candidates).unwrap();
            let results = render_docids(iter);
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }

    #[test]
    fn filter_sort_ascending_with_inexisting_field_id() {
        let indexes = [
            get_simple_string_index_with_multiple_field_ids(),
            get_random_looking_string_index_with_multiple_field_ids(),
        ];
        for index in &indexes {
            let txn = index.env.read_txn().unwrap();
            let candidates = RoaringBitmap::new();

            // Field id 3 is expected not to exist in these indexes.
            let iter = ascending_facet_sort(&txn, index.content, 3, candidates).unwrap();
            let results = render_docids(iter);
            assert!(results.is_empty());

            txn.commit().unwrap();
        }
    }
}
|