Reintroduce facet distribution functionality

This commit is contained in:
Loïc Lecrenier 2022-08-31 07:50:18 +02:00 committed by Loïc Lecrenier
parent b8a1caad5e
commit 5a904cf29d
3 changed files with 72 additions and 70 deletions

View File

@ -1,13 +1,18 @@
use std::collections::{BTreeMap, HashSet};
use std::ops::Bound::Unbounded;
use std::ops::ControlFlow;
use std::{fmt, mem};
use heed::types::ByteSlice;
use heed::BytesDecode;
use roaring::RoaringBitmap;
use crate::error::UserError;
use crate::facet::FacetType;
use crate::heed_codec::facet::new::ordered_f64_codec::OrderedF64Codec;
use crate::heed_codec::facet::new::str_ref::StrRefCodec;
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKeyCodec, MyByteSlice};
use crate::heed_codec::facet::{FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec};
use crate::search::facet::facet_distribution_iter;
// use crate::search::facet::FacetStringIter;
use crate::{FieldId, Index, Result};
@ -131,22 +136,21 @@ impl<'a> FacetDistribution<'a> {
candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
todo!()
// let iter =
// FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
// for result in iter {
// let (value, mut docids) = result?;
// docids &= candidates;
// if !docids.is_empty() {
// distribution.insert(value.to_string(), docids.len());
// }
// if distribution.len() == self.max_values_per_facet {
// break;
// }
// }
// Ok(())
facet_distribution_iter::iterate_over_facet_distribution(
self.rtxn,
&self.index.facet_id_f64_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
field_id,
candidates,
|facet_key, nbr_docids| {
let facet_key = OrderedF64Codec::bytes_decode(facet_key).unwrap();
distribution.insert(facet_key.to_string(), nbr_docids);
if distribution.len() == self.max_values_per_facet {
ControlFlow::Break(())
} else {
ControlFlow::Continue(())
}
},
)
}
fn facet_strings_distribution_from_facet_levels(
@ -155,22 +159,21 @@ impl<'a> FacetDistribution<'a> {
candidates: &RoaringBitmap,
distribution: &mut BTreeMap<String, u64>,
) -> heed::Result<()> {
todo!()
// let iter =
// FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?;
// for result in iter {
// let (_normalized, original, mut docids) = result?;
// docids &= candidates;
// if !docids.is_empty() {
// distribution.insert(original.to_string(), docids.len());
// }
// if distribution.len() == self.max_values_per_facet {
// break;
// }
// }
// Ok(())
facet_distribution_iter::iterate_over_facet_distribution(
self.rtxn,
&self.index.facet_id_string_docids.remap_key_type::<FacetKeyCodec<MyByteSlice>>(),
field_id,
candidates,
|facet_key, nbr_docids| {
let facet_key = StrRefCodec::bytes_decode(facet_key).unwrap();
distribution.insert(facet_key.to_string(), nbr_docids);
if distribution.len() == self.max_values_per_facet {
ControlFlow::Break(())
} else {
ControlFlow::Continue(())
}
},
)
}
/// Placeholder search, a.k.a. no candidates were specified. We iterate through the
@ -179,43 +182,42 @@ impl<'a> FacetDistribution<'a> {
&self,
field_id: FieldId,
) -> heed::Result<BTreeMap<String, u64>> {
todo!()
// let mut distribution = BTreeMap::new();
let mut distribution = BTreeMap::new();
// let db = self.index.facet_id_f64_docids;
// let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?;
let db = self.index.facet_id_f64_docids;
let mut prefix = vec![];
prefix.extend_from_slice(&field_id.to_be_bytes());
prefix.push(0);
let iter = db
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetKeyCodec<OrderedF64Codec>, FacetGroupValueCodec>();
// for result in range {
// let ((_, _, value, _), docids) = result?;
// distribution.insert(value.to_string(), docids.len());
// if distribution.len() == self.max_values_per_facet {
// break;
// }
// }
for result in iter {
let (key, value) = result?;
distribution.insert(key.left_bound.to_string(), value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
}
// let iter = self
// .index
// .facet_id_string_docids
// .remap_key_type::<ByteSlice>()
// .prefix_iter(self.rtxn, &field_id.to_be_bytes())?
// .remap_key_type::<FacetStringLevelZeroCodec>();
let iter = self
.index
.facet_id_string_docids
.as_polymorph()
.prefix_iter::<_, ByteSlice, ByteSlice>(self.rtxn, prefix.as_slice())?
.remap_types::<FacetKeyCodec<StrRefCodec>, FacetGroupValueCodec>();
// let mut normalized_distribution = BTreeMap::new();
// for result in iter {
// let ((_, normalized_value), group_value) = result?;
// normalized_distribution
// .insert(normalized_value, (normalized_value, group_value.bitmap.len()));
// if normalized_distribution.len() == self.max_values_per_facet {
// break;
// }
// }
// TODO: get the original value of the facet somewhere (in the documents DB?)
for result in iter {
let (key, value) = result?;
distribution.insert(key.left_bound.to_owned(), value.bitmap.len());
if distribution.len() == self.max_values_per_facet {
break;
}
}
// let iter = normalized_distribution
// .into_iter()
// .map(|(_normalized, (original, count))| (original.to_string(), count));
// distribution.extend(iter);
// Ok(distribution)
Ok(distribution)
}
fn facet_values(&self, field_id: FieldId) -> heed::Result<BTreeMap<String, u64>> {

View File

@ -1,5 +1,5 @@
use crate::heed_codec::facet::new::{FacetGroupValueCodec, FacetKey, FacetKeyCodec, MyByteSlice};
use crate::Result;
use heed::Result;
use roaring::RoaringBitmap;
use std::ops::ControlFlow;
@ -20,7 +20,7 @@ where
get_highest_level(rtxn, &db.remap_key_type::<FacetKeyCodec<MyByteSlice>>(), field_id)?;
if let Some(first_bound) = get_first_facet_value::<MyByteSlice>(rtxn, db, field_id)? {
fd.iterate(candidates, highest_level, first_bound, usize::MAX);
fd.iterate(candidates, highest_level, first_bound, usize::MAX)?;
return Ok(());
} else {
return Ok(());

View File

@ -20,7 +20,7 @@ pub(crate) fn get_first_facet_value<'t, BoundCodec>(
txn: &'t RoTxn,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16,
) -> crate::Result<Option<BoundCodec::DItem>>
) -> heed::Result<Option<BoundCodec::DItem>>
where
BoundCodec: BytesDecode<'t>,
{
@ -42,7 +42,7 @@ pub(crate) fn get_last_facet_value<'t, BoundCodec>(
txn: &'t RoTxn,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16,
) -> crate::Result<Option<BoundCodec::DItem>>
) -> heed::Result<Option<BoundCodec::DItem>>
where
BoundCodec: BytesDecode<'t>,
{
@ -65,7 +65,7 @@ pub(crate) fn get_highest_level<'t>(
txn: &'t RoTxn<'t>,
db: &'t heed::Database<FacetKeyCodec<MyByteSlice>, FacetGroupValueCodec>,
field_id: u16,
) -> crate::Result<u8> {
) -> heed::Result<u8> {
let field_id_prefix = &field_id.to_be_bytes();
Ok(db
.as_polymorph()