mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
Clean and make the facet order configurable internally
This commit is contained in:
parent
f42bef2f66
commit
80bbd4b6f3
@ -9,12 +9,14 @@ use roaring::RoaringBitmap;
|
|||||||
use crate::error::UserError;
|
use crate::error::UserError;
|
||||||
use crate::facet::FacetType;
|
use crate::facet::FacetType;
|
||||||
use crate::heed_codec::facet::{
|
use crate::heed_codec::facet::{
|
||||||
FacetGroupKeyCodec, FacetGroupValueCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec,
|
FacetGroupKeyCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, OrderedF64Codec,
|
||||||
OrderedF64Codec,
|
|
||||||
};
|
};
|
||||||
use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
|
use crate::heed_codec::{ByteSliceRefCodec, StrRefCodec};
|
||||||
use crate::search::facet::facet_distribution_iter;
|
use crate::search::facet::facet_distribution_iter;
|
||||||
use crate::{FieldId, Index, Result};
|
use crate::{FieldId, Index, Result};
|
||||||
|
use facet_distribution_iter::{
|
||||||
|
count_iterate_over_facet_distribution, lexicographically_iterate_over_facet_distribution,
|
||||||
|
};
|
||||||
|
|
||||||
/// The default number of values by facets that will
|
/// The default number of values by facets that will
|
||||||
/// be fetched from the key-value store.
|
/// be fetched from the key-value store.
|
||||||
@ -24,10 +26,20 @@ pub const DEFAULT_VALUES_PER_FACET: usize = 100;
|
|||||||
/// the system to choose between one algorithm or another.
|
/// the system to choose between one algorithm or another.
|
||||||
const CANDIDATES_THRESHOLD: u64 = 3000;
|
const CANDIDATES_THRESHOLD: u64 = 3000;
|
||||||
|
|
||||||
|
/// The order in which the values of a facet are fetched.
|
||||||
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
|
||||||
|
pub enum OrderBy {
|
||||||
|
/// Facet values are fetched in lexicographic order.
|
||||||
|
Lexicographic,
|
||||||
|
/// Facet values are fetched by the number of docids they have in common with the candidates.
|
||||||
|
Count,
|
||||||
|
}
|
||||||
|
|
||||||
pub struct FacetDistribution<'a> {
|
pub struct FacetDistribution<'a> {
|
||||||
facets: Option<HashSet<String>>,
|
facets: Option<HashSet<String>>,
|
||||||
candidates: Option<RoaringBitmap>,
|
candidates: Option<RoaringBitmap>,
|
||||||
max_values_per_facet: usize,
|
max_values_per_facet: usize,
|
||||||
|
order_by: OrderBy,
|
||||||
rtxn: &'a heed::RoTxn<'a>,
|
rtxn: &'a heed::RoTxn<'a>,
|
||||||
index: &'a Index,
|
index: &'a Index,
|
||||||
}
|
}
|
||||||
@ -38,6 +50,7 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
facets: None,
|
facets: None,
|
||||||
candidates: None,
|
candidates: None,
|
||||||
max_values_per_facet: DEFAULT_VALUES_PER_FACET,
|
max_values_per_facet: DEFAULT_VALUES_PER_FACET,
|
||||||
|
order_by: OrderBy::Count,
|
||||||
rtxn,
|
rtxn,
|
||||||
index,
|
index,
|
||||||
}
|
}
|
||||||
@ -53,6 +66,11 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
self
|
self
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn order_by(&mut self, order_by: OrderBy) -> &mut Self {
|
||||||
|
self.order_by = order_by;
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
|
pub fn candidates(&mut self, candidates: RoaringBitmap) -> &mut Self {
|
||||||
self.candidates = Some(candidates);
|
self.candidates = Some(candidates);
|
||||||
self
|
self
|
||||||
@ -134,9 +152,15 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
&self,
|
&self,
|
||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
candidates: &RoaringBitmap,
|
candidates: &RoaringBitmap,
|
||||||
|
order_by: OrderBy,
|
||||||
distribution: &mut BTreeMap<String, u64>,
|
distribution: &mut BTreeMap<String, u64>,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
facet_distribution_iter::lexicographically_iterate_over_facet_distribution(
|
let search_function = match order_by {
|
||||||
|
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
|
||||||
|
OrderBy::Count => count_iterate_over_facet_distribution,
|
||||||
|
};
|
||||||
|
|
||||||
|
search_function(
|
||||||
self.rtxn,
|
self.rtxn,
|
||||||
self.index
|
self.index
|
||||||
.facet_id_f64_docids
|
.facet_id_f64_docids
|
||||||
@ -159,9 +183,15 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
&self,
|
&self,
|
||||||
field_id: FieldId,
|
field_id: FieldId,
|
||||||
candidates: &RoaringBitmap,
|
candidates: &RoaringBitmap,
|
||||||
|
order_by: OrderBy,
|
||||||
distribution: &mut BTreeMap<String, u64>,
|
distribution: &mut BTreeMap<String, u64>,
|
||||||
) -> heed::Result<()> {
|
) -> heed::Result<()> {
|
||||||
facet_distribution_iter::lexicographically_iterate_over_facet_distribution(
|
let search_function = match order_by {
|
||||||
|
OrderBy::Lexicographic => lexicographically_iterate_over_facet_distribution,
|
||||||
|
OrderBy::Count => count_iterate_over_facet_distribution,
|
||||||
|
};
|
||||||
|
|
||||||
|
search_function(
|
||||||
self.rtxn,
|
self.rtxn,
|
||||||
self.index
|
self.index
|
||||||
.facet_id_string_docids
|
.facet_id_string_docids
|
||||||
@ -189,98 +219,42 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
|
|
||||||
/// facet values one by one and iterate on the facet level 0 for numbers.
|
|
||||||
fn facet_values_from_raw_facet_database(
|
|
||||||
&self,
|
|
||||||
field_id: FieldId,
|
|
||||||
) -> heed::Result<BTreeMap<String, u64>> {
|
|
||||||
let mut distribution = BTreeMap::new();
|
|
||||||
|
|
||||||
let db = self.index.facet_id_f64_docids;
|
|
||||||
let mut prefix = vec![];
|
|
||||||
prefix.extend_from_slice(&field_id.to_be_bytes());
|
|
||||||
prefix.push(0); // read values from level 0 only
|
|
||||||
|
|
||||||
let iter = db
|
|
||||||
.as_polymorph()
|
|
||||||
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
|
|
||||||
.remap_types::<FacetGroupKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>();
|
|
||||||
|
|
||||||
for result in iter {
|
|
||||||
let (key, value) = result?;
|
|
||||||
distribution.insert(key.left_bound.to_string(), value.bitmap.len());
|
|
||||||
if distribution.len() == self.max_values_per_facet {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let iter = self
|
|
||||||
.index
|
|
||||||
.facet_id_string_docids
|
|
||||||
.as_polymorph()
|
|
||||||
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
|
|
||||||
.remap_types::<FacetGroupKeyCodec<StrRefCodec>, FacetGroupValueCodec>();
|
|
||||||
|
|
||||||
for result in iter {
|
|
||||||
let (key, value) = result?;
|
|
||||||
|
|
||||||
let docid = value.bitmap.iter().next().unwrap();
|
|
||||||
let key: (FieldId, _, &'a str) = (field_id, docid, key.left_bound);
|
|
||||||
let original_string =
|
|
||||||
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
|
|
||||||
|
|
||||||
distribution.insert(original_string, value.bitmap.len());
|
|
||||||
if distribution.len() == self.max_values_per_facet {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(distribution)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
|
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {
|
||||||
// use FacetType::{Number, String};
|
use FacetType::{Number, String};
|
||||||
|
|
||||||
let candidates = match self.candidates.as_ref() {
|
|
||||||
Some(candidates) => candidates.clone(),
|
|
||||||
None => todo!("fetch candidates"),
|
|
||||||
};
|
|
||||||
|
|
||||||
let mut distribution = BTreeMap::new();
|
let mut distribution = BTreeMap::new();
|
||||||
|
match (self.order_by, &self.candidates) {
|
||||||
let number_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
|
(OrderBy::Lexicographic, Some(cnd)) if cnd.len() <= CANDIDATES_THRESHOLD => {
|
||||||
self.rtxn,
|
// Classic search, candidates were specified, we must return facet values only related
|
||||||
self.index
|
// to those candidates. We also enter here for facet strings for performance reasons.
|
||||||
.facet_id_f64_docids
|
self.facet_distribution_from_documents(field_id, Number, cnd, &mut distribution)?;
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
self.facet_distribution_from_documents(field_id, String, cnd, &mut distribution)?;
|
||||||
field_id,
|
}
|
||||||
&candidates,
|
_ => {
|
||||||
)?;
|
let universe;
|
||||||
|
let candidates;
|
||||||
for (count, facet_key, _) in number_distribution {
|
match &self.candidates {
|
||||||
let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
|
Some(cnd) => candidates = cnd,
|
||||||
distribution.insert(facet_key.to_string(), count);
|
None => {
|
||||||
|
universe = self.index.documents_ids(self.rtxn)?;
|
||||||
|
candidates = &universe;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let string_distribution = facet_distribution_iter::count_iterate_over_facet_distribution(
|
self.facet_numbers_distribution_from_facet_levels(
|
||||||
self.rtxn,
|
|
||||||
self.index
|
|
||||||
.facet_id_string_docids
|
|
||||||
.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
|
||||||
field_id,
|
field_id,
|
||||||
&candidates,
|
candidates,
|
||||||
|
self.order_by,
|
||||||
|
&mut distribution,
|
||||||
|
)?;
|
||||||
|
self.facet_strings_distribution_from_facet_levels(
|
||||||
|
field_id,
|
||||||
|
candidates,
|
||||||
|
self.order_by,
|
||||||
|
&mut distribution,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
for (count, facet_key, any_docid) in string_distribution {
|
|
||||||
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
|
|
||||||
|
|
||||||
let key: (FieldId, _, &str) = (field_id, any_docid, facet_key);
|
|
||||||
let original_string =
|
|
||||||
self.index.field_id_docid_facet_strings.get(self.rtxn, &key)?.unwrap().to_owned();
|
|
||||||
|
|
||||||
distribution.insert(original_string, count);
|
|
||||||
}
|
}
|
||||||
|
};
|
||||||
|
|
||||||
Ok(distribution)
|
Ok(distribution)
|
||||||
}
|
}
|
||||||
@ -381,13 +355,20 @@ impl<'a> FacetDistribution<'a> {
|
|||||||
|
|
||||||
impl fmt::Debug for FacetDistribution<'_> {
|
impl fmt::Debug for FacetDistribution<'_> {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
let FacetDistribution { facets, candidates, max_values_per_facet, rtxn: _, index: _ } =
|
let FacetDistribution {
|
||||||
self;
|
facets,
|
||||||
|
candidates,
|
||||||
|
max_values_per_facet,
|
||||||
|
order_by,
|
||||||
|
rtxn: _,
|
||||||
|
index: _,
|
||||||
|
} = self;
|
||||||
|
|
||||||
f.debug_struct("FacetDistribution")
|
f.debug_struct("FacetDistribution")
|
||||||
.field("facets", facets)
|
.field("facets", facets)
|
||||||
.field("candidates", candidates)
|
.field("candidates", candidates)
|
||||||
.field("max_values_per_facet", max_values_per_facet)
|
.field("max_values_per_facet", max_values_per_facet)
|
||||||
|
.field("order_by", order_by)
|
||||||
.finish()
|
.finish()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -46,12 +46,16 @@ where
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn count_iterate_over_facet_distribution<'t>(
|
pub fn count_iterate_over_facet_distribution<'t, CB>(
|
||||||
rtxn: &'t heed::RoTxn<'t>,
|
rtxn: &'t heed::RoTxn<'t>,
|
||||||
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
db: heed::Database<FacetGroupKeyCodec<ByteSliceRefCodec>, FacetGroupValueCodec>,
|
||||||
field_id: u16,
|
field_id: u16,
|
||||||
candidates: &RoaringBitmap,
|
candidates: &RoaringBitmap,
|
||||||
) -> Result<Vec<(u64, &'t [u8], u32)>> {
|
mut callback: CB,
|
||||||
|
) -> Result<()>
|
||||||
|
where
|
||||||
|
CB: FnMut(&'t [u8], u64, DocumentId) -> Result<ControlFlow<()>>,
|
||||||
|
{
|
||||||
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
|
#[derive(Debug, PartialOrd, Ord, PartialEq, Eq)]
|
||||||
struct LevelEntry<'t> {
|
struct LevelEntry<'t> {
|
||||||
/// The number of candidates in this entry.
|
/// The number of candidates in this entry.
|
||||||
@ -68,8 +72,6 @@ pub fn count_iterate_over_facet_distribution<'t>(
|
|||||||
|
|
||||||
// Represents the list of keys that we must explore.
|
// Represents the list of keys that we must explore.
|
||||||
let mut heap = BinaryHeap::new();
|
let mut heap = BinaryHeap::new();
|
||||||
let mut results = Vec::new();
|
|
||||||
|
|
||||||
let highest_level = get_highest_level(
|
let highest_level = get_highest_level(
|
||||||
rtxn,
|
rtxn,
|
||||||
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
db.remap_key_type::<FacetGroupKeyCodec<ByteSliceRefCodec>>(),
|
||||||
@ -103,10 +105,9 @@ pub fn count_iterate_over_facet_distribution<'t>(
|
|||||||
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
|
while let Some(LevelEntry { count, level, left_bound, group_size, any_docid }) = heap.pop()
|
||||||
{
|
{
|
||||||
if let Reverse(0) = level {
|
if let Reverse(0) = level {
|
||||||
results.push((count, left_bound, any_docid));
|
match (callback)(left_bound, count, any_docid)? {
|
||||||
// TODO better just call the user callback and ask for a ControlFlow
|
ControlFlow::Continue(_) => (),
|
||||||
if results.len() == 20 {
|
ControlFlow::Break(_) => return Ok(()),
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
let starting_key =
|
let starting_key =
|
||||||
@ -132,11 +133,9 @@ pub fn count_iterate_over_facet_distribution<'t>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(results)
|
|
||||||
} else {
|
|
||||||
Ok(Default::default())
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterate over the facets values by lexicographic order.
|
/// Iterate over the facets values by lexicographic order.
|
||||||
|
Loading…
Reference in New Issue
Block a user