Use both number and string facet databases in the distinct system

This commit is contained in:
Clément Renault 2021-04-28 10:20:31 +02:00 committed by Kerollmops
parent 837c1041c7
commit 597144b0b9
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
2 changed files with 85 additions and 73 deletions

View File

@ -8,7 +8,6 @@ use ordered_float::OrderedFloat;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use crate::facet::FacetType; use crate::facet::FacetType;
use crate::heed_codec::facet::FieldDocIdFacetF64Codec;
use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder};
use crate::search::facet::FacetIter; use crate::search::facet::FacetIter;
use crate::search::query_tree::Operation; use crate::search::query_tree::Operation;
@ -39,8 +38,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
Self::new(index, rtxn, parent, field_name, true) Self::new(index, rtxn, parent, field_name, true)
} }
@ -49,8 +47,7 @@ impl<'t> AscDesc<'t> {
rtxn: &'t heed::RoTxn, rtxn: &'t heed::RoTxn,
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
Self::new(index, rtxn, parent, field_name, false) Self::new(index, rtxn, parent, field_name, false)
} }
@ -60,11 +57,11 @@ impl<'t> AscDesc<'t> {
parent: Box<dyn Criterion + 't>, parent: Box<dyn Criterion + 't>,
field_name: String, field_name: String,
ascending: bool, ascending: bool,
) -> anyhow::Result<Self> ) -> anyhow::Result<Self> {
{
let fields_ids_map = index.fields_ids_map(rtxn)?; let fields_ids_map = index.fields_ids_map(rtxn)?;
let faceted_fields = index.faceted_fields(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?;
let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; let (field_id, facet_type) =
field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?;
Ok(AscDesc { Ok(AscDesc {
index, index,
@ -86,8 +83,10 @@ impl<'t> Criterion for AscDesc<'t> {
#[logging_timer::time("AscDesc::{}")] #[logging_timer::time("AscDesc::{}")]
fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> { fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result<Option<CriterionResult>> {
loop { loop {
debug!("Facet {}({}) iteration", debug!(
if self.ascending { "Asc" } else { "Desc" }, self.field_name "Facet {}({}) iteration",
if self.ascending { "Asc" } else { "Desc" },
self.field_name
); );
match self.candidates.next().transpose()? { match self.candidates.next().transpose()? {
@ -138,7 +137,7 @@ impl<'t> Criterion for AscDesc<'t> {
filtered_candidates: None, filtered_candidates: None,
bucket_candidates: Some(take(&mut self.bucket_candidates)), bucket_candidates: Some(take(&mut self.bucket_candidates)),
})); }));
}, }
} }
} }
} }
@ -148,14 +147,13 @@ fn field_id_facet_type(
fields_ids_map: &FieldsIdsMap, fields_ids_map: &FieldsIdsMap,
faceted_fields: &HashMap<String, FacetType>, faceted_fields: &HashMap<String, FacetType>,
field: &str, field: &str,
) -> anyhow::Result<(FieldId, FacetType)> ) -> anyhow::Result<(FieldId, FacetType)> {
{ let id = fields_ids_map
let id = fields_ids_map.id(field).with_context(|| { .id(field)
format!("field {:?} isn't registered", field) .with_context(|| format!("field {:?} isn't registered", field))?;
})?; let facet_type = faceted_fields
let facet_type = faceted_fields.get(field).with_context(|| { .get(field)
format!("field {:?} isn't faceted", field) .with_context(|| format!("field {:?} isn't faceted", field))?;
})?;
Ok((id, *facet_type)) Ok((id, *facet_type))
} }
@ -170,14 +168,12 @@ fn facet_ordered<'t>(
facet_type: FacetType, facet_type: FacetType,
ascending: bool, ascending: bool,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> ) -> anyhow::Result<Box<dyn Iterator<Item = heed::Result<RoaringBitmap>> + 't>> {
{
match facet_type { match facet_type {
FacetType::Number => { FacetType::Number => {
if candidates.len() <= CANDIDATES_THRESHOLD { if candidates.len() <= CANDIDATES_THRESHOLD {
let iter = iterative_facet_ordered_iter( let iter =
index, rtxn, field_id, ascending, candidates, iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?;
)?;
Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>) Ok(Box::new(iter.map(Ok)) as Box<dyn Iterator<Item = _>>)
} else { } else {
let facet_fn = if ascending { let facet_fn = if ascending {
@ -188,7 +184,7 @@ fn facet_ordered<'t>(
let iter = facet_fn(rtxn, index, field_id, candidates)?; let iter = facet_fn(rtxn, index, field_id, candidates)?;
Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids))))
} }
}, }
FacetType::String => bail!("criteria facet type must be a number"), FacetType::String => bail!("criteria facet type must be a number"),
} }
} }
@ -202,14 +198,14 @@ fn iterative_facet_ordered_iter<'t>(
field_id: FieldId, field_id: FieldId,
ascending: bool, ascending: bool,
candidates: RoaringBitmap, candidates: RoaringBitmap,
) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> ) -> anyhow::Result<impl Iterator<Item = RoaringBitmap> + 't> {
{
let db = index.field_id_docid_facet_values.remap_key_type::<FieldDocIdFacetF64Codec>();
let mut docids_values = Vec::with_capacity(candidates.len() as usize); let mut docids_values = Vec::with_capacity(candidates.len() as usize);
for docid in candidates.iter() { for docid in candidates.iter() {
let left = (field_id, docid, f64::MIN); let left = (field_id, docid, f64::MIN);
let right = (field_id, docid, f64::MAX); let right = (field_id, docid, f64::MAX);
let mut iter = db.range(rtxn, &(left..=right))?; let mut iter = index
.field_id_docid_facet_f64s
.range(rtxn, &(left..=right))?;
let entry = if ascending { iter.next() } else { iter.last() }; let entry = if ascending { iter.next() } else { iter.last() };
if let Some(((_, _, value), ())) = entry.transpose()? { if let Some(((_, _, value), ())) = entry.transpose()? {
docids_values.push((docid, OrderedFloat(value))); docids_values.push((docid, OrderedFloat(value)));
@ -226,7 +222,8 @@ fn iterative_facet_ordered_iter<'t>(
// The itertools GroupBy iterator doesn't provide an owned version, we are therefore // The itertools GroupBy iterator doesn't provide an owned version, we are therefore
// required to collect the result into an owned collection (a Vec). // required to collect the result into an owned collection (a Vec).
// https://github.com/rust-itertools/itertools/issues/499 // https://github.com/rust-itertools/itertools/issues/499
let vec: Vec<_> = iter.group_by(|(_, v)| *v) let vec: Vec<_> = iter
.group_by(|(_, v)| v.clone())
.into_iter() .into_iter()
.map(|(_, ids)| ids.map(|(id, _)| id).collect()) .map(|(_, ids)| ids.map(|(id, _)| id).collect())
.collect(); .collect();

View File

@ -1,10 +1,14 @@
use std::mem::size_of; use std::mem::size_of;
use heed::types::ByteSlice;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::{Distinct, DocIter};
use crate::heed_codec::facet::*; use crate::heed_codec::facet::*;
use crate::{facet::FacetType, DocumentId, FieldId, Index}; use crate::{facet::FacetType, DocumentId, FieldId, Index};
use super::{Distinct, DocIter};
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
/// A distinct implementer that is backed by facets. /// A distinct implementer that is backed by facets.
/// ///
@ -48,31 +52,27 @@ pub struct FacetDistinctIter<'a> {
} }
impl<'a> FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> {
fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result<RoaringBitmap> fn facet_string_docids(&self, key: &str) -> heed::Result<Option<RoaringBitmap>> {
where self.index
KC: heed::BytesEncode<'c>, .facet_id_string_docids
{ .get(self.txn, &(self.distinct, key))
let facet_docids = self }
.index
.facet_field_id_value_docids fn facet_number_docids(&self, key: f64) -> heed::Result<Option<RoaringBitmap>> {
.remap_key_type::<KC>() // get facet docids on level 0
.get(self.txn, key)? self.index
.expect("Corrupted data: Facet values must exist"); .facet_id_f64_docids
Ok(facet_docids) .get(self.txn, &(self.distinct, 0, key, key))
} }
fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetStringCodec>( let iter = facet_string_values(id, self.distinct, self.index, self.txn)?;
id,
self.distinct,
self.index,
self.txn,
)?;
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
let key = (self.distinct, value); let facet_docids = self
let facet_docids = self.get_facet_docids::<FacetValueStringCodec>(&key)?; .facet_string_docids(value)?
.expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids); self.excluded.union_with(&facet_docids);
} }
@ -82,17 +82,13 @@ impl<'a> FacetDistinctIter<'a> {
} }
fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> {
let iter = get_facet_values::<FieldDocIdFacetF64Codec>(id, let iter = facet_number_values(id, self.distinct, self.index, self.txn)?;
self.distinct,
self.index,
self.txn,
)?;
for item in iter { for item in iter {
let ((_, _, value), _) = item?; let ((_, _, value), _) = item?;
// get facet docids on level 0 let facet_docids = self
let key = (self.distinct, 0, value, value); .facet_number_docids(value)?
let facet_docids = self.get_facet_docids::<FacetLevelValueF64Codec>(&key)?; .expect("Corrupted data: Facet values must exist");
self.excluded.union_with(&facet_docids); self.excluded.union_with(&facet_docids);
} }
@ -129,26 +125,44 @@ impl<'a> FacetDistinctIter<'a> {
} }
} }
fn get_facet_values<'a, KC>( fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] {
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
key
}
fn facet_number_values<'a>(
id: DocumentId, id: DocumentId,
distinct: FieldId, distinct: FieldId,
index: &Index, index: &Index,
txn: &'a heed::RoTxn, txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, KC, heed::types::Unit>> ) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetF64Codec, heed::types::Unit>> {
where let key = facet_values_prefix_key(distinct, id);
KC: heed::BytesDecode<'a>,
{
const FID_SIZE: usize = size_of::<FieldId>();
const DOCID_SIZE: usize = size_of::<DocumentId>();
let mut key = [0; FID_SIZE + DOCID_SIZE];
key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes());
key[FID_SIZE..].copy_from_slice(&id.to_be_bytes());
let iter = index let iter = index
.field_id_docid_facet_values .field_id_docid_facet_f64s
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)? .prefix_iter(txn, &key)?
.remap_key_type::<KC>(); .remap_key_type::<FieldDocIdFacetF64Codec>();
Ok(iter)
}
fn facet_string_values<'a>(
id: DocumentId,
distinct: FieldId,
index: &Index,
txn: &'a heed::RoTxn,
) -> anyhow::Result<heed::RoPrefix<'a, FieldDocIdFacetStringCodec, heed::types::Unit>> {
let key = facet_values_prefix_key(distinct, id);
let iter = index
.field_id_docid_facet_strings
.remap_key_type::<ByteSlice>()
.prefix_iter(txn, &key)?
.remap_key_type::<FieldDocIdFacetStringCodec>();
Ok(iter) Ok(iter)
} }
@ -186,8 +200,8 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> {
mod test { mod test {
use std::collections::HashMap; use std::collections::HashMap;
use super::*;
use super::super::test::{generate_index, validate_distinct_candidates}; use super::super::test::{generate_index, validate_distinct_candidates};
use super::*;
use crate::facet::FacetType; use crate::facet::FacetType;
macro_rules! test_facet_distinct { macro_rules! test_facet_distinct {
@ -196,7 +210,8 @@ mod test {
fn $name() { fn $name() {
use std::iter::FromIterator; use std::iter::FromIterator;
let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); let facets =
HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string())));
let (index, fid, candidates) = generate_index($distinct, facets); let (index, fid, candidates) = generate_index($distinct, facets);
let txn = index.read_txn().unwrap(); let txn = index.read_txn().unwrap();
let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type); let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);