From 597144b0b93ff23ef7523685d5112ecc4ce799d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Wed, 28 Apr 2021 10:20:31 +0200 Subject: [PATCH] Use both number and string facet databases in the distinct system --- milli/src/search/criteria/asc_desc.rs | 57 ++++++----- milli/src/search/distinct/facet_distinct.rs | 101 +++++++++++--------- 2 files changed, 85 insertions(+), 73 deletions(-) diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index 0511ce319..9e8bebb8f 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,6 @@ use ordered_float::OrderedFloat; use roaring::RoaringBitmap; use crate::facet::FacetType; -use crate::heed_codec::facet::FieldDocIdFacetF64Codec; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; use crate::search::facet::FacetIter; use crate::search::query_tree::Operation; @@ -39,8 +38,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, true) } @@ -49,8 +47,7 @@ impl<'t> AscDesc<'t> { rtxn: &'t heed::RoTxn, parent: Box, field_name: String, - ) -> anyhow::Result - { + ) -> anyhow::Result { Self::new(index, rtxn, parent, field_name, false) } @@ -60,11 +57,11 @@ impl<'t> AscDesc<'t> { parent: Box, field_name: String, ascending: bool, - ) -> anyhow::Result - { + ) -> anyhow::Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let faceted_fields = index.faceted_fields(rtxn)?; - let (field_id, facet_type) = field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; + let (field_id, facet_type) = + field_id_facet_type(&fields_ids_map, &faceted_fields, &field_name)?; Ok(AscDesc { index, @@ -86,8 +83,10 @@ impl<'t> Criterion for AscDesc<'t> { #[logging_timer::time("AscDesc::{}")] fn next(&mut self, params: &mut CriterionParameters) -> anyhow::Result> { loop { - debug!("Facet {}({}) iteration", - if self.ascending { "Asc" } else { "Desc" }, self.field_name + debug!( + "Facet {}({}) iteration", + if self.ascending { "Asc" } else { "Desc" }, + self.field_name ); match self.candidates.next().transpose()? { @@ -138,7 +137,7 @@ impl<'t> Criterion for AscDesc<'t> { filtered_candidates: None, bucket_candidates: Some(take(&mut self.bucket_candidates)), })); - }, + } } } } @@ -148,14 +147,13 @@ fn field_id_facet_type( fields_ids_map: &FieldsIdsMap, faceted_fields: &HashMap, field: &str, -) -> anyhow::Result<(FieldId, FacetType)> -{ - let id = fields_ids_map.id(field).with_context(|| { - format!("field {:?} isn't registered", field) - })?; - let facet_type = faceted_fields.get(field).with_context(|| { - format!("field {:?} isn't faceted", field) - })?; +) -> anyhow::Result<(FieldId, FacetType)> { + let id = fields_ids_map + .id(field) + .with_context(|| format!("field {:?} isn't registered", field))?; + let facet_type = faceted_fields + .get(field) + .with_context(|| format!("field {:?} isn't faceted", field))?; Ok((id, *facet_type)) } @@ -170,14 +168,12 @@ fn facet_ordered<'t>( facet_type: FacetType, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result> + 't>> -{ +) -> anyhow::Result> + 't>> { match facet_type { FacetType::Number => { if candidates.len() <= CANDIDATES_THRESHOLD { - let iter = iterative_facet_ordered_iter( - index, rtxn, field_id, ascending, candidates, - )?; + let iter = + iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { let facet_fn = if ascending { @@ -188,7 +184,7 @@ fn facet_ordered<'t>( let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } - }, + } FacetType::String => bail!("criteria facet type must be a number"), } } @@ -202,14 +198,14 @@ fn iterative_facet_ordered_iter<'t>( field_id: FieldId, ascending: bool, candidates: RoaringBitmap, -) -> anyhow::Result + 't> -{ - let db = index.field_id_docid_facet_values.remap_key_type::(); +) -> anyhow::Result + 't> { let mut docids_values = Vec::with_capacity(candidates.len() as usize); for docid in candidates.iter() { let left = (field_id, docid, f64::MIN); let right = (field_id, docid, f64::MAX); - let mut iter = db.range(rtxn, &(left..=right))?; + let mut iter = index + .field_id_docid_facet_f64s + .range(rtxn, &(left..=right))?; let entry = if ascending { iter.next() } else { iter.last() }; if let Some(((_, _, value), ())) = entry.transpose()? { docids_values.push((docid, OrderedFloat(value))); @@ -226,7 +222,8 @@ fn iterative_facet_ordered_iter<'t>( // The itertools GroupBy iterator doesn't provide an owned version, we are therefore // required to collect the result into an owned collection (a Vec). // https://github.com/rust-itertools/itertools/issues/499 - let vec: Vec<_> = iter.group_by(|(_, v)| *v) + let vec: Vec<_> = iter + .group_by(|(_, v)| v.clone()) .into_iter() .map(|(_, ids)| ids.map(|(id, _)| id).collect()) .collect(); diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 3c508b25b..f3952e6f1 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,10 +1,14 @@ use std::mem::size_of; +use heed::types::ByteSlice; use roaring::RoaringBitmap; +use super::{Distinct, DocIter}; use crate::heed_codec::facet::*; use crate::{facet::FacetType, DocumentId, FieldId, Index}; -use super::{Distinct, DocIter}; + +const FID_SIZE: usize = size_of::(); +const DOCID_SIZE: usize = size_of::(); /// A distinct implementer that is backed by facets. /// @@ -48,31 +52,27 @@ pub struct FacetDistinctIter<'a> { } impl<'a> FacetDistinctIter<'a> { - fn get_facet_docids<'c, KC>(&self, key: &'c KC::EItem) -> anyhow::Result - where - KC: heed::BytesEncode<'c>, - { - let facet_docids = self - .index - .facet_field_id_value_docids - .remap_key_type::() - .get(self.txn, key)? - .expect("Corrupted data: Facet values must exist"); - Ok(facet_docids) + fn facet_string_docids(&self, key: &str) -> heed::Result> { + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + } + + fn facet_number_docids(&self, key: f64) -> heed::Result> { + // get facet docids on level 0 + self.index + .facet_id_f64_docids + .get(self.txn, &(self.distinct, 0, key, key)) } fn distinct_string(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::( - id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_string_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - let key = (self.distinct, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_string_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -82,17 +82,13 @@ impl<'a> FacetDistinctIter<'a> { } fn distinct_number(&mut self, id: DocumentId) -> anyhow::Result<()> { - let iter = get_facet_values::(id, - self.distinct, - self.index, - self.txn, - )?; + let iter = facet_number_values(id, self.distinct, self.index, self.txn)?; for item in iter { let ((_, _, value), _) = item?; - // get facet docids on level 0 - let key = (self.distinct, 0, value, value); - let facet_docids = self.get_facet_docids::(&key)?; + let facet_docids = self + .facet_number_docids(value)? + .expect("Corrupted data: Facet values must exist"); self.excluded.union_with(&facet_docids); } @@ -129,26 +125,44 @@ impl<'a> FacetDistinctIter<'a> { } } -fn get_facet_values<'a, KC>( +fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { + let mut key = [0; FID_SIZE + DOCID_SIZE]; + key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); + key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); + key +} + +fn facet_number_values<'a>( id: DocumentId, distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> anyhow::Result> -where - KC: heed::BytesDecode<'a>, -{ - const FID_SIZE: usize = size_of::(); - const DOCID_SIZE: usize = size_of::(); - - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); let iter = index - .field_id_docid_facet_values + .field_id_docid_facet_f64s + .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_key_type::(); + + Ok(iter) +} + +fn facet_string_values<'a>( + id: DocumentId, + distinct: FieldId, + index: &Index, + txn: &'a heed::RoTxn, +) -> anyhow::Result> { + let key = facet_values_prefix_key(distinct, id); + + let iter = index + .field_id_docid_facet_strings + .remap_key_type::() + .prefix_iter(txn, &key)? + .remap_key_type::(); + Ok(iter) } @@ -186,8 +200,8 @@ impl<'a> Distinct<'_> for FacetDistinct<'a> { mod test { use std::collections::HashMap; - use super::*; use super::super::test::{generate_index, validate_distinct_candidates}; + use super::*; use crate::facet::FacetType; macro_rules! test_facet_distinct { @@ -196,7 +210,8 @@ mod test { fn $name() { use std::iter::FromIterator; - let facets = HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); + let facets = + HashMap::from_iter(Some(($distinct.to_string(), $facet_type.to_string()))); let (index, fid, candidates) = generate_index($distinct, facets); let txn = index.read_txn().unwrap(); let mut map_distinct = FacetDistinct::new(fid, &index, &txn, $facet_type);