Clean and make the facet order configurable internally

This commit is contained in:
Kerollmops 2023-05-29 13:39:06 +02:00 committed by Clément Renault
parent f42bef2f66
commit 80bbd4b6f3
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 86 additions and 106 deletions

View File

@ -9,12 +9,14 @@ use roaring::RoaringBitmap;
use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::{
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
OrderedF64Codec,
FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
};
use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
use crate::search::facet::facet_distribution_iter;
use crate::{FieldId, Index, Result};
use facet_distribution_iter::{
count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
};
/// The default number of values per facet that will
/// be fetched from the key-value store.
@ -24,10 +26,20 @@ pub const DEFAULT_VALUES_PER_FACET: usize = 100;
/// the system to choose between one algorithm or another.
const CANDIDATES_THRESHOLD: u64 = 3000;
/// The order in which the values of a facet are fetched when computing a
/// facet distribution.
///
/// The chosen variant selects which facet-levels iteration routine is used:
/// `lexicographically_iterate_over_facet_distribution` or
/// `count_iterate_over_facet_distribution`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum OrderBy {
/// Fetch the facet values in lexicographic order.
Lexicographic,
/// Fetch the facet values according to the number of candidate document
/// ids they contain (presumably the most frequent values first; confirm
/// against `count_iterate_over_facet_distribution`).
Count,
}
/// Builder-style description of a facet distribution computation over an
/// [`Index`], read through a `heed` read transaction.
pub struct FacetDistribution<'a> {
/// NOTE(review): presumably the names of the facets to compute, `None`
/// meaning no restriction was requested; confirm against the setter and
/// the execution code, which are not visible here.
facets: Option<HashSet<String>>,
/// Document ids the distribution is restricted to. When `None`,
/// `facet_values` falls back to all the document ids of the index.
candidates: Option<RoaringBitmap>,
/// Number of values fetched per facet; initialised to
/// `DEFAULT_VALUES_PER_FACET`.
max_values_per_facet: usize,
/// Order in which the facet values are fetched; initialised to
/// `OrderBy::Count`.
order_by: OrderBy,
/// Read transaction used for every database access.
rtxn: &'a heed::RoTxn<'a>,
/// Index whose facet databases are read.
index: &'a Index,
}
@ -38,6 +50,7 @@ impl<'a> FacetDistribution<'a> {
facets: None,
candidates: None,
max_values_per_facet: DEFAULT_VALUES_PER_FACET,
order_by: OrderBy::Count,
rtxn,
index,
}
@ -53,6 +66,11 @@ impl<'a> FacetDistribution<'a> {
self
}
/// Sets the order in which the facet values are fetched.
///
/// Returns `self` so that calls can be chained with the other setters.
pub fn order_by(&mut self, order_by: OrderBy) -> &mut Self {
self.order_by = order_by;
self
}
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
self.candidates = Some(candidates);
self
@ -134,9 +152,15 @@ impl<'a> FacetDistribution<'a> {
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
facet_distribution_iter::lexicographically_iterate_over_facet_distribution(
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
self.rtxn,
self.index
.facet_id_f64_docids
@ -159,9 +183,15 @@ impl<'a> FacetDistribution<'a> {
&self,
field_id: FieldId,
candidates: &RoaringBitmap,
order_by: OrderBy,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
facet_distribution_iter::lexicographically_iterate_over_facet_distribution(
let search_function = match order_by {
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
OrderBy::Count => count_iterate_over_facet_distribution,
};
search_function(
self.rtxn,
self.index
.facet_id_string_docids
@ -189,98 +219,42 @@ impl<'a> FacetDistribution<'a> {
)
}
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
/// facet values one by one, reading level 0 of the facet databases, first for
/// numbers and then for strings.
///
/// Returns a map from the facet value (number rendered as a string, or the
/// original string) to the number of document ids stored for that value.
///
/// # Errors
///
/// Propagates any `heed` error raised while iterating or reading the databases.
///
/// # Panics
///
/// Panics if a string facet entry has an empty bitmap, or if no original string
/// is stored in `field_id_docid_facet_strings` for the looked-up key.
///
/// NOTE(review): the limit is tested with `==` right after each insertion. If the
/// number loop already stopped at `max_values_per_facet`, inserting a new string
/// key makes the length exceed the limit and the check in the string loop no
/// longer triggers. Confirm whether that is intended.
fn facet_values_from_raw_facet_database(
&self,
field_id: FieldId,
) -> heed::Result<BTreeMap<String, u64>> {
let mut distribution = BTreeMap::new();
let db = self.index.facet_id_f64_docids;
// Key prefix shared by both databases: big-endian field id followed by the level.
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(0); // read values from level 0 only
// Iterate on raw bytes with the prefix, then decode the entries as number facet groups.
let iter = db
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>();
for result in iter {
let (key, value) = result?;
distribution.insert(key.left_bound.to_string(), value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
}
// Same prefix scan, this time on the string facet database.
let iter = self
.index
.facet_id_string_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>();
for result in iter {
let (key, value) = result?;
// Any document id of the group is enough to look up the original string.
let docid = value.bitmap.iter().next().unwrap();
let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound);
// The distribution is keyed by the string stored in `field_id_docid_facet_strings`
// rather than by the facet database key itself.
let original_string =
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
distribution.insert(original_string, value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
}
Ok(distribution)
}
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
// use FacetType::{Number, String};
let candidates = match self.candidates.as_ref() {
Some(candidates) => candidates.clone(),
None => todo!("fetch candidates"),
};
use FacetType::{Number, String};
let mut distribution = BTreeMap::new();
let number_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
self.rtxn,
self.index
.facet_id_f64_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
field_id,
&candidates,
)?;
for (count, facet_key, _) in number_distribution {
let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
distribution.insert(facet_key.to_string(), count);
match (self.order_by, &self.candidates) {
(OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
// Classic search, candidates were specified, we must return facet values only related
// to those candidates. We also enter here for facet strings for performance reasons.
self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
}
_ => {
let universe;
let candidates;
match &self.candidates {
Some(cnd) => candidates = cnd,
None => {
universe = self.index.documents_ids(self.rtxn)?;
candidates = &universe;
}
}
let string_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
self.rtxn,
self.index
.facet_id_string_docids
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
self.facet_numbers_distribution_from_facet_levels(
field_id,
&candidates,
candidates,
self.order_by,
&mut distribution,
)?;
self.facet_strings_distribution_from_facet_levels(
field_id,
candidates,
self.order_by,
&mut distribution,
)?;
for (count, facet_key, any_docid) in string_distribution {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
let original_string =
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
distribution.insert(original_string, count);
}
};
Ok(distribution)
}
@ -381,13 +355,20 @@ impl<'a> FacetDistribution<'a> {
impl fmt::Debug for FacetDistribution<'_> {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } =
self;
let FacetDistribution {
facets,
candidates,
max_values_per_facet,
order_by,
rtxn: _,
index: _,
} = self;
f.debug_struct("FacetDistribution")
.field("facets", facets)
.field("candidates", candidates)
.field("max_values_per_facet", max_values_per_facet)
.field("order_by", order_by)
.finish()
}
}

View File

@ -46,12 +46,16 @@ where
}
}
pub fn count_iterate_over_facet_distribution<'t>(
pub fn count_iterate_over_facet_distribution<'t, CB>(
rtxn: &'t heed::RoTxn<'t>,
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
field_id: u16,
candidates: &RoaringBitmap,
) -> Result<Vec<(u64, &'t [u8], u32)>> {
mut callback: CB,
) -> Result<()>
where
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
{
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
struct LevelEntry<'t> {
/// The number of candidates in this entry.
@ -68,8 +72,6 @@ pub fn count_iterate_over_facet_distribution<'t>(
// Represents the list of keys that we must explore.
let mut heap = BinaryHeap::new();
let mut results = Vec::new();
let highest_level = get_highest_level(
rtxn,
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
@ -103,10 +105,9 @@ pub fn count_iterate_over_facet_distribution<'t>(
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
{
if let Reverse(0) = level {
results.push((count, left_bound, any_docid));
// TODO better just call the user callback and ask for a ControlFlow
if results.len() == 20 {
break;
match (callback)(left_bound, count, any_docid)? {
ControlFlow::Continue(_) => (),
ControlFlow::Break(_) => return Ok(()),
}
} else {
let starting_key =
@ -132,11 +133,9 @@ pub fn count_iterate_over_facet_distribution<'t>(
}
}
}
Ok(results)
} else {
Ok(Default::default())
}
Ok(())
}
/// Iterate over the facets values by lexicographic order.