2021-02-19 15:45:15 +01:00
|
|
|
use std::mem::take;
|
|
|
|
|
|
|
|
use itertools::Itertools;
|
2021-03-03 11:43:42 +01:00
|
|
|
use log::debug;
|
2021-02-19 15:45:15 +01:00
|
|
|
use ordered_float::OrderedFloat;
|
|
|
|
use roaring::RoaringBitmap;
|
|
|
|
|
2021-06-16 18:33:33 +02:00
|
|
|
use super::{Criterion, CriterionParameters, CriterionResult};
|
2022-09-05 12:51:40 +02:00
|
|
|
use crate::facet::FacetType;
|
2022-09-05 17:31:26 +02:00
|
|
|
use crate::heed_codec::facet::{ByteSliceRef, FacetGroupKeyCodec};
|
2021-03-03 11:43:42 +01:00
|
|
|
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
|
2022-08-31 09:36:19 +02:00
|
|
|
use crate::search::facet::facet_sort_ascending::ascending_facet_sort;
|
|
|
|
use crate::search::facet::facet_sort_descending::descending_facet_sort;
|
2022-08-29 16:01:54 +02:00
|
|
|
// use crate::search::facet::FacetStringIter;
|
2021-02-19 15:45:15 +01:00
|
|
|
use crate::search::query_tree::Operation;
|
2021-06-14 16:46:19 +02:00
|
|
|
use crate::{FieldId, Index, Result};
|
2021-02-19 15:45:15 +01:00
|
|
|
|
2021-03-30 10:57:10 +02:00
|
|
|
/// Threshold on the number of candidates that makes the system
/// choose between one sorting algorithm and the other.
|
|
|
|
const CANDIDATES_THRESHOLD: u64 = 1000;
|
2021-03-29 18:07:22 +02:00
|
|
|
|
2021-02-19 15:45:15 +01:00
|
|
|
pub struct AscDesc<'t> {
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn<'t>,
|
2021-03-04 10:13:34 +01:00
|
|
|
field_name: String,
|
2021-07-22 17:11:17 +02:00
|
|
|
field_id: Option<FieldId>,
|
2021-08-23 11:37:18 +02:00
|
|
|
is_ascending: bool,
|
2021-02-19 15:45:15 +01:00
|
|
|
query_tree: Option<Operation>,
|
2021-03-04 11:00:18 +01:00
|
|
|
candidates: Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>,
|
2021-06-02 16:30:56 +02:00
|
|
|
allowed_candidates: RoaringBitmap,
|
2021-02-25 16:47:34 +01:00
|
|
|
bucket_candidates: RoaringBitmap,
|
2021-02-19 15:45:15 +01:00
|
|
|
faceted_candidates: RoaringBitmap,
|
2021-03-23 15:25:46 +01:00
|
|
|
parent: Box<dyn Criterion + 't>,
|
2021-02-19 15:45:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t> AscDesc<'t> {
|
|
|
|
pub fn asc(
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
|
|
|
parent: Box<dyn Criterion + 't>,
|
2021-03-04 10:13:34 +01:00
|
|
|
field_name: String,
|
2021-06-14 16:46:19 +02:00
|
|
|
) -> Result<Self> {
|
2021-03-04 10:13:34 +01:00
|
|
|
Self::new(index, rtxn, parent, field_name, true)
|
2021-02-19 15:45:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
pub fn desc(
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
|
|
|
parent: Box<dyn Criterion + 't>,
|
2021-03-04 10:13:34 +01:00
|
|
|
field_name: String,
|
2021-06-14 16:46:19 +02:00
|
|
|
) -> Result<Self> {
|
2021-03-04 10:13:34 +01:00
|
|
|
Self::new(index, rtxn, parent, field_name, false)
|
2021-02-19 15:45:15 +01:00
|
|
|
}
|
|
|
|
|
|
|
|
fn new(
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
|
|
|
parent: Box<dyn Criterion + 't>,
|
2021-03-04 10:13:34 +01:00
|
|
|
field_name: String,
|
2021-08-23 11:37:18 +02:00
|
|
|
is_ascending: bool,
|
2021-06-14 16:46:19 +02:00
|
|
|
) -> Result<Self> {
|
2021-03-04 10:13:34 +01:00
|
|
|
let fields_ids_map = index.fields_ids_map(rtxn)?;
|
2021-07-22 17:11:17 +02:00
|
|
|
let field_id = fields_ids_map.id(&field_name);
|
|
|
|
let faceted_candidates = match field_id {
|
2021-08-23 11:37:18 +02:00
|
|
|
Some(field_id) => {
|
2022-09-05 12:51:40 +02:00
|
|
|
let number_faceted =
|
|
|
|
index.faceted_documents_ids(rtxn, field_id, FacetType::Number)?;
|
|
|
|
let string_faceted =
|
|
|
|
index.faceted_documents_ids(rtxn, field_id, FacetType::String)?;
|
2021-08-23 11:37:18 +02:00
|
|
|
number_faceted | string_faceted
|
|
|
|
}
|
2021-07-22 17:11:17 +02:00
|
|
|
None => RoaringBitmap::default(),
|
|
|
|
};
|
2021-03-04 10:13:34 +01:00
|
|
|
|
2021-02-19 15:45:15 +01:00
|
|
|
Ok(AscDesc {
|
|
|
|
index,
|
|
|
|
rtxn,
|
2021-03-04 10:13:34 +01:00
|
|
|
field_name,
|
2021-02-19 15:45:15 +01:00
|
|
|
field_id,
|
2021-08-23 11:37:18 +02:00
|
|
|
is_ascending,
|
2021-02-19 15:45:15 +01:00
|
|
|
query_tree: None,
|
2021-03-04 11:00:18 +01:00
|
|
|
candidates: Box::new(std::iter::empty()),
|
2021-06-02 16:30:56 +02:00
|
|
|
allowed_candidates: RoaringBitmap::new(),
|
2021-07-22 17:11:17 +02:00
|
|
|
faceted_candidates,
|
2021-02-25 16:47:34 +01:00
|
|
|
bucket_candidates: RoaringBitmap::new(),
|
2021-03-23 15:25:46 +01:00
|
|
|
parent,
|
2021-02-19 15:45:15 +01:00
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
impl<'t> Criterion for AscDesc<'t> {
|
2021-03-06 11:28:22 +01:00
|
|
|
#[logging_timer::time("AscDesc::{}")]
|
2021-06-14 16:46:19 +02:00
|
|
|
fn next(&mut self, params: &mut CriterionParameters) -> Result<Option<CriterionResult>> {
|
2021-06-02 16:30:56 +02:00
|
|
|
// remove excluded candidates when next is called, instead of doing it in the loop.
|
|
|
|
self.allowed_candidates -= params.excluded_candidates;
|
|
|
|
|
2021-02-19 15:45:15 +01:00
|
|
|
loop {
|
2021-04-28 10:20:31 +02:00
|
|
|
debug!(
|
|
|
|
"Facet {}({}) iteration",
|
2021-08-23 11:37:18 +02:00
|
|
|
if self.is_ascending { "Asc" } else { "Desc" },
|
2021-04-28 10:20:31 +02:00
|
|
|
self.field_name
|
2021-03-03 11:43:42 +01:00
|
|
|
);
|
|
|
|
|
2021-03-04 11:00:18 +01:00
|
|
|
match self.candidates.next().transpose()? {
|
2021-06-02 16:30:56 +02:00
|
|
|
None if !self.allowed_candidates.is_empty() => {
|
|
|
|
return Ok(Some(CriterionResult {
|
|
|
|
query_tree: self.query_tree.clone(),
|
|
|
|
candidates: Some(take(&mut self.allowed_candidates)),
|
|
|
|
filtered_candidates: None,
|
|
|
|
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
|
|
|
}));
|
2021-06-16 18:33:33 +02:00
|
|
|
}
|
|
|
|
None => match self.parent.next(params)? {
|
|
|
|
Some(CriterionResult {
|
|
|
|
query_tree,
|
|
|
|
candidates,
|
|
|
|
filtered_candidates,
|
|
|
|
bucket_candidates,
|
|
|
|
}) => {
|
|
|
|
self.query_tree = query_tree;
|
|
|
|
let mut candidates = match (&self.query_tree, candidates) {
|
|
|
|
(_, Some(candidates)) => candidates,
|
|
|
|
(Some(qt), None) => {
|
|
|
|
let context = CriteriaBuilder::new(&self.rtxn, &self.index)?;
|
|
|
|
resolve_query_tree(&context, qt, params.wdcache)?
|
2021-03-23 15:25:46 +01:00
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
(None, None) => self.index.documents_ids(self.rtxn)?,
|
|
|
|
};
|
|
|
|
|
|
|
|
if let Some(filtered_candidates) = filtered_candidates {
|
|
|
|
candidates &= filtered_candidates;
|
|
|
|
}
|
|
|
|
|
|
|
|
match bucket_candidates {
|
|
|
|
Some(bucket_candidates) => self.bucket_candidates |= bucket_candidates,
|
|
|
|
None => self.bucket_candidates |= &candidates,
|
|
|
|
}
|
|
|
|
|
|
|
|
if candidates.is_empty() {
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
self.allowed_candidates = &candidates - params.excluded_candidates;
|
2021-07-22 17:11:17 +02:00
|
|
|
self.candidates = match self.field_id {
|
|
|
|
Some(field_id) => facet_ordered(
|
|
|
|
self.index,
|
|
|
|
self.rtxn,
|
|
|
|
field_id,
|
2021-08-23 11:37:18 +02:00
|
|
|
self.is_ascending,
|
2021-07-22 17:11:17 +02:00
|
|
|
candidates & &self.faceted_candidates,
|
|
|
|
)?,
|
|
|
|
None => Box::new(std::iter::empty()),
|
|
|
|
};
|
2021-03-03 11:43:42 +01:00
|
|
|
}
|
2021-06-16 18:33:33 +02:00
|
|
|
None => return Ok(None),
|
2021-02-19 15:45:15 +01:00
|
|
|
},
|
2021-04-28 18:01:23 +02:00
|
|
|
Some(mut candidates) => {
|
|
|
|
candidates -= params.excluded_candidates;
|
2021-06-02 16:30:56 +02:00
|
|
|
self.allowed_candidates -= &candidates;
|
2021-02-19 15:45:15 +01:00
|
|
|
return Ok(Some(CriterionResult {
|
2021-03-03 11:43:42 +01:00
|
|
|
query_tree: self.query_tree.clone(),
|
2021-03-09 12:04:52 +01:00
|
|
|
candidates: Some(candidates),
|
2021-05-10 12:33:37 +02:00
|
|
|
filtered_candidates: None,
|
2021-05-05 20:46:56 +02:00
|
|
|
bucket_candidates: Some(take(&mut self.bucket_candidates)),
|
2021-02-19 15:45:15 +01:00
|
|
|
}));
|
2021-04-28 10:20:31 +02:00
|
|
|
}
|
2021-02-19 15:45:15 +01:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2021-03-04 11:00:18 +01:00
|
|
|
/// Returns an iterator over groups of the given candidates in ascending or descending order.
|
|
|
|
///
|
2021-03-29 18:07:22 +02:00
|
|
|
/// It will either use an iterative or a recursive method on the whole facet database depending
|
2021-03-04 11:00:18 +01:00
|
|
|
/// on the number of candidates to rank.
|
|
|
|
fn facet_ordered<'t>(
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
2021-02-19 15:45:15 +01:00
|
|
|
field_id: FieldId,
|
2021-08-23 11:37:18 +02:00
|
|
|
is_ascending: bool,
|
2021-02-19 15:45:15 +01:00
|
|
|
candidates: RoaringBitmap,
|
2021-06-14 16:46:19 +02:00
|
|
|
) -> Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
|
2021-04-28 17:58:16 +02:00
|
|
|
if candidates.len() <= CANDIDATES_THRESHOLD {
|
2021-08-23 11:37:18 +02:00
|
|
|
let number_iter = iterative_facet_number_ordered_iter(
|
|
|
|
index,
|
|
|
|
rtxn,
|
|
|
|
field_id,
|
|
|
|
is_ascending,
|
|
|
|
candidates.clone(),
|
|
|
|
)?;
|
|
|
|
let string_iter =
|
|
|
|
iterative_facet_string_ordered_iter(index, rtxn, field_id, is_ascending, candidates)?;
|
|
|
|
Ok(Box::new(number_iter.chain(string_iter).map(Ok)) as Box<dyn Iterator<Item = _>>)
|
2021-04-28 17:58:16 +02:00
|
|
|
} else {
|
2022-08-31 09:36:19 +02:00
|
|
|
let make_iter = if is_ascending { ascending_facet_sort } else { descending_facet_sort };
|
|
|
|
|
|
|
|
let number_iter = make_iter(
|
|
|
|
rtxn,
|
2022-09-05 13:01:36 +02:00
|
|
|
index.facet_id_f64_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
2022-08-31 09:36:19 +02:00
|
|
|
field_id,
|
|
|
|
candidates.clone(),
|
|
|
|
)?;
|
2022-09-01 12:51:54 +02:00
|
|
|
|
2022-08-31 09:36:19 +02:00
|
|
|
let string_iter = make_iter(
|
|
|
|
rtxn,
|
2022-09-05 13:01:36 +02:00
|
|
|
index.facet_id_string_docids.remap_key_type::<FacetGroupKeyCodec<ByteSliceRef>>(),
|
2022-08-31 09:36:19 +02:00
|
|
|
field_id,
|
|
|
|
candidates,
|
|
|
|
)?;
|
|
|
|
|
|
|
|
Ok(Box::new(number_iter.chain(string_iter)))
|
2021-02-19 15:45:15 +01:00
|
|
|
}
|
|
|
|
}
|
2021-03-04 11:00:18 +01:00
|
|
|
|
2021-08-23 11:37:18 +02:00
|
|
|
/// Fetch the whole list of candidates facet number values one by one and order them by it.
|
2021-03-04 11:00:18 +01:00
|
|
|
///
|
|
|
|
/// This function is fast when the amount of candidates to rank is small.
|
2021-08-23 11:37:18 +02:00
|
|
|
fn iterative_facet_number_ordered_iter<'t>(
|
2021-03-04 11:00:18 +01:00
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
|
|
|
field_id: FieldId,
|
2021-08-23 11:37:18 +02:00
|
|
|
is_ascending: bool,
|
2021-03-04 11:00:18 +01:00
|
|
|
candidates: RoaringBitmap,
|
2021-06-14 16:46:19 +02:00
|
|
|
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
|
2021-03-04 11:00:18 +01:00
|
|
|
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
|
|
|
for docid in candidates.iter() {
|
2021-04-07 11:57:16 +02:00
|
|
|
let left = (field_id, docid, f64::MIN);
|
|
|
|
let right = (field_id, docid, f64::MAX);
|
2021-06-16 18:33:33 +02:00
|
|
|
let mut iter = index.field_id_docid_facet_f64s.range(rtxn, &(left..=right))?;
|
2021-08-23 11:37:18 +02:00
|
|
|
let entry = if is_ascending { iter.next() } else { iter.last() };
|
2021-03-04 11:00:18 +01:00
|
|
|
if let Some(((_, _, value), ())) = entry.transpose()? {
|
2021-04-07 11:57:16 +02:00
|
|
|
docids_values.push((docid, OrderedFloat(value)));
|
2021-03-04 11:00:18 +01:00
|
|
|
}
|
|
|
|
}
|
2021-05-10 10:27:18 +02:00
|
|
|
docids_values.sort_unstable_by_key(|(_, v)| *v);
|
2021-03-04 11:00:18 +01:00
|
|
|
let iter = docids_values.into_iter();
|
2021-08-23 11:37:18 +02:00
|
|
|
let iter = if is_ascending {
|
|
|
|
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
|
|
|
} else {
|
|
|
|
Box::new(iter.rev())
|
|
|
|
};
|
|
|
|
|
|
|
|
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
|
|
|
|
// required to collect the result into an owned collection (a Vec).
|
|
|
|
// https://github.com/rust-itertools/itertools/issues/499
|
|
|
|
let vec: Vec<_> = iter
|
|
|
|
.group_by(|(_, v)| *v)
|
|
|
|
.into_iter()
|
|
|
|
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
Ok(vec.into_iter())
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Fetch the whole list of candidates facet string values one by one and order them by it.
|
|
|
|
///
|
|
|
|
/// This function is fast when the amount of candidates to rank is small.
|
|
|
|
fn iterative_facet_string_ordered_iter<'t>(
|
|
|
|
index: &'t Index,
|
|
|
|
rtxn: &'t heed::RoTxn,
|
|
|
|
field_id: FieldId,
|
|
|
|
is_ascending: bool,
|
|
|
|
candidates: RoaringBitmap,
|
|
|
|
) -> Result<impl Iterator<Item = RoaringBitmap> + 't> {
|
|
|
|
let mut docids_values = Vec::with_capacity(candidates.len() as usize);
|
|
|
|
for docid in candidates.iter() {
|
|
|
|
let left = (field_id, docid, "");
|
|
|
|
let right = (field_id, docid.saturating_add(1), "");
|
|
|
|
// FIXME Doing this means that it will never be possible to retrieve
|
|
|
|
// the document with id 2^32, not sure this is a real problem.
|
|
|
|
let mut iter = index.field_id_docid_facet_strings.range(rtxn, &(left..right))?;
|
|
|
|
let entry = if is_ascending { iter.next() } else { iter.last() };
|
|
|
|
if let Some(((_, _, value), _)) = entry.transpose()? {
|
|
|
|
docids_values.push((docid, value));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
docids_values.sort_unstable_by_key(|(_, v)| *v);
|
|
|
|
let iter = docids_values.into_iter();
|
|
|
|
let iter = if is_ascending {
|
2021-03-04 11:00:18 +01:00
|
|
|
Box::new(iter) as Box<dyn Iterator<Item = _>>
|
|
|
|
} else {
|
|
|
|
Box::new(iter.rev())
|
|
|
|
};
|
|
|
|
|
|
|
|
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore
|
|
|
|
// required to collect the result into an owned collection (a Vec).
|
|
|
|
// https://github.com/rust-itertools/itertools/issues/499
|
2021-04-28 10:20:31 +02:00
|
|
|
let vec: Vec<_> = iter
|
2021-08-23 11:37:18 +02:00
|
|
|
.group_by(|(_, v)| *v)
|
2021-03-04 11:00:18 +01:00
|
|
|
.into_iter()
|
|
|
|
.map(|(_, ids)| ids.map(|(id, _)| id).collect())
|
|
|
|
.collect();
|
|
|
|
|
|
|
|
Ok(vec.into_iter())
|
|
|
|
}
|