From f858f64b1f0f69d791d3ba38b52ea1d002faacad Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 10:29:00 +0200 Subject: [PATCH 01/12] Move the facet number iterators into their own module --- milli/src/search/criteria/asc_desc.rs | 9 +- milli/src/search/facet/facet_distribution.rs | 6 +- milli/src/search/facet/facet_number.rs | 248 +++++++++++++++++++ milli/src/search/facet/filter_condition.rs | 4 +- milli/src/search/facet/mod.rs | 248 +------------------ milli/src/search/mod.rs | 2 +- 6 files changed, 262 insertions(+), 255 deletions(-) create mode 100644 milli/src/search/facet/facet_number.rs diff --git a/milli/src/search/criteria/asc_desc.rs b/milli/src/search/criteria/asc_desc.rs index ccee2c393..99d63c90d 100644 --- a/milli/src/search/criteria/asc_desc.rs +++ b/milli/src/search/criteria/asc_desc.rs @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use super::{Criterion, CriterionParameters, CriterionResult}; use crate::error::FieldIdMapMissingEntry; use crate::search::criteria::{resolve_query_tree, CriteriaBuilder}; -use crate::search::facet::FacetIter; +use crate::search::facet::FacetNumberIter; use crate::search::query_tree::Operation; use crate::{FieldId, Index, Result}; @@ -172,8 +172,11 @@ fn facet_ordered<'t>( let iter = iterative_facet_ordered_iter(index, rtxn, field_id, ascending, candidates)?; Ok(Box::new(iter.map(Ok)) as Box>) } else { - let facet_fn = - if ascending { FacetIter::new_reducing } else { FacetIter::new_reverse_reducing }; + let facet_fn = if ascending { + FacetNumberIter::new_reducing + } else { + FacetNumberIter::new_reverse_reducing + }; let iter = facet_fn(rtxn, index, field_id, candidates)?; Ok(Box::new(iter.map(|res| res.map(|(_, docids)| docids)))) } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index b0b22ac49..080fd9af7 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetValueStringCodec; -use crate::search::facet::{FacetIter, FacetRange}; +use crate::search::facet::{FacetNumberIter, FacetNumberRange}; use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will @@ -118,7 +118,7 @@ impl<'a> FacetDistribution<'a> { distribution: &mut BTreeMap, ) -> heed::Result<()> { let iter = - FacetIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + FacetNumberIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { let (value, mut docids) = result?; @@ -143,7 +143,7 @@ impl<'a> FacetDistribution<'a> { let mut distribution = BTreeMap::new(); let db = self.index.facet_id_f64_docids; - let range = FacetRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; + let range = FacetNumberRange::new(self.rtxn, db, field_id, 0, Unbounded, Unbounded)?; for result in range { let ((_, _, value, _), docids) = result?; diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs new file mode 100644 index 000000000..f943b96da --- /dev/null +++ b/milli/src/search/facet/facet_number.rs @@ -0,0 +1,248 @@ +use std::ops::Bound::{self, Excluded, Included, Unbounded}; + +use either::Either::{self, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore}; +use heed::{Database, LazyDecode, RoRange, RoRevRange}; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::{FieldId, Index}; + +pub struct FacetNumberRange<'t> { + iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((fid, level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +pub struct FacetNumberRevRange<'t> { + iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, + end: Bound, +} + +impl<'t> FacetNumberRevRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + level: u8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, f64::MIN)), + Excluded(left) => Excluded((field_id, level, left, f64::MIN)), + Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), + }; + let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); + let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; + Ok(FacetNumberRevRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetNumberRevRange<'t> { + type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; + + fn next(&mut self) -> Option { + loop { + match self.iter.next() { + Some(Ok(((fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), + Err(e) => return Some(Err(e)), + } + } + continue; + } + Some(Err(e)) => return Some(Err(e)), + None => return None, + } + } + } +} + +pub struct FacetNumberIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: Vec<(RoaringBitmap, Either, FacetNumberRevRange<'t>>)>, + must_reduce: bool, +} + +impl<'t> FacetNumberIter<'t> { + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries in reverse + /// (facet value + documents ids) and that will reduce the given documents ids + /// while iterating on the different facet levels. + pub fn new_reverse_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Right(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: true }) + } + + /// Create a `FacetNumberIter` that will iterate on the different facet entries + /// (facet value + documents ids) and that will not reduce the given documents ids + /// while iterating on the different facet levels, possibly returning multiple times + /// a document id associated with multiple facet values. + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + let db = index.facet_id_f64_docids.remap_key_type::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = + FacetNumberRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; + let level_iters = vec![(documents_ids, Left(highest_iter))]; + Ok(FacetNumberIter { rtxn, db, field_id, level_iters, must_reduce: false }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + let level = db + .remap_types::() + .prefix_iter(rtxn, &fid.to_be_bytes())? + .remap_key_type::() + .last() + .transpose()? + .map(|((_, level, _, _), _)| level); + Ok(level) + } +} + +impl<'t> Iterator for FacetNumberIter<'t> { + type Item = heed::Result<(f64, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + let is_ascending = last.is_left(); + for result in last { + // If the last iterator must find an empty set of documents it means + // that we found all the documents in the sub level iterations already, + // we can pop this level iterator. + if documents_ids.is_empty() { + break; + } + + match result { + Ok(((_fid, level, left, right), mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + if self.must_reduce { + *documents_ids -= &docids; + } + + if level == 0 { + return Some(Ok((left, docids))); + } + + let rtxn = self.rtxn; + let db = self.db; + let fid = self.field_id; + let left = Included(left); + let right = Included(right); + + let result = if is_ascending { + FacetNumberRange::new(rtxn, db, fid, level - 1, left, right) + .map(Left) + } else { + FacetNumberRevRange::new(rtxn, db, fid, level - 1, left, right) + .map(Right) + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + self.level_iters.pop(); + } + } +} diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 1b1eafcab..875fe3b27 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -15,7 +15,7 @@ use roaring::RoaringBitmap; use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; -use super::FacetRange; +use super::FacetNumberRange; use crate::error::UserError; use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; @@ -282,7 +282,7 @@ impl FilterCondition { // We must create a custom iterator to be able to iterate over the // requested range as the range iterator cannot express some conditions. - let iter = FacetRange::new(rtxn, db, field_id, level, left, right)?; + let iter = FacetNumberRange::new(rtxn, db, field_id, level, left, right)?; debug!("Iterating between {:?} and {:?} (level {})", left, right, level); diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 9774bdd52..e6ea92543 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,253 +1,9 @@ -use std::ops::Bound::{self, Excluded, Included, Unbounded}; - -use either::Either::{self, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore}; -use heed::{Database, LazyDecode, RoRange, RoRevRange}; -use roaring::RoaringBitmap; - pub use self::facet_distribution::FacetDistribution; +pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; -use crate::heed_codec::facet::FacetLevelValueF64Codec; -use crate::heed_codec::CboRoaringBitmapCodec; -use crate::{FieldId, Index}; mod facet_distribution; +mod facet_number; mod filter_condition; mod parser; - -pub struct FacetRange<'t> { - iter: RoRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => Some(Ok(((fid, level, left, right), docids))), - Err(e) => Some(Err(e)), - } - } else { - None - } - } - Some(Err(e)) => Some(Err(e)), - None => None, - } - } -} - -pub struct FacetRevRange<'t> { - iter: RoRevRange<'t, FacetLevelValueF64Codec, LazyDecode>, - end: Bound, -} - -impl<'t> FacetRevRange<'t> { - pub fn new( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - level: u8, - left: Bound, - right: Bound, - ) -> heed::Result> { - let left_bound = match left { - Included(left) => Included((field_id, level, left, f64::MIN)), - Excluded(left) => Excluded((field_id, level, left, f64::MIN)), - Unbounded => Included((field_id, level, f64::MIN, f64::MIN)), - }; - let right_bound = Included((field_id, level, f64::MAX, f64::MAX)); - let iter = db.lazily_decode_data().rev_range(rtxn, &(left_bound, right_bound))?; - Ok(FacetRevRange { iter, end: right }) - } -} - -impl<'t> Iterator for FacetRevRange<'t> { - type Item = heed::Result<((FieldId, u8, f64, f64), RoaringBitmap)>; - - fn next(&mut self) -> Option { - loop { - match self.iter.next() { - Some(Ok(((fid, level, left, right), docids))) => { - let must_be_returned = match self.end { - Included(end) => right <= end, - Excluded(end) => right < end, - Unbounded => true, - }; - if must_be_returned { - match docids.decode() { - Ok(docids) => return Some(Ok(((fid, level, left, right), docids))), - Err(e) => return Some(Err(e)), - } - } - continue; - } - Some(Err(e)) => return Some(Err(e)), - None => return None, - } - } - } -} - -pub struct FacetIter<'t> { - rtxn: &'t heed::RoTxn<'t>, - db: Database, - field_id: FieldId, - level_iters: Vec<(RoaringBitmap, Either, FacetRevRange<'t>>)>, - must_reduce: bool, -} - -impl<'t> FacetIter<'t> { - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries in reverse - /// (facet value + documents ids) and that will reduce the given documents ids - /// while iterating on the different facet levels. - pub fn new_reverse_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Right(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: true }) - } - - /// Create a `FacetIter` that will iterate on the different facet entries - /// (facet value + documents ids) and that will not reduce the given documents ids - /// while iterating on the different facet levels, possibly returning multiple times - /// a document id associated with multiple facet values. - pub fn new_non_reducing( - rtxn: &'t heed::RoTxn, - index: &'t Index, - field_id: FieldId, - documents_ids: RoaringBitmap, - ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); - let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); - let highest_iter = - FacetRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; - let level_iters = vec![(documents_ids, Left(highest_iter))]; - Ok(FacetIter { rtxn, db, field_id, level_iters, must_reduce: false }) - } - - fn highest_level( - rtxn: &'t heed::RoTxn, - db: Database, - fid: FieldId, - ) -> heed::Result> { - let level = db - .remap_types::() - .prefix_iter(rtxn, &fid.to_be_bytes())? - .remap_key_type::() - .last() - .transpose()? - .map(|((_, level, _, _), _)| level); - Ok(level) - } -} - -impl<'t> Iterator for FacetIter<'t> { - type Item = heed::Result<(f64, RoaringBitmap)>; - - fn next(&mut self) -> Option { - 'outer: loop { - let (documents_ids, last) = self.level_iters.last_mut()?; - let is_ascending = last.is_left(); - for result in last { - // If the last iterator must find an empty set of documents it means - // that we found all the documents in the sub level iterations already, - // we can pop this level iterator. - if documents_ids.is_empty() { - break; - } - - match result { - Ok(((_fid, level, left, right), mut docids)) => { - docids &= &*documents_ids; - if !docids.is_empty() { - if self.must_reduce { - *documents_ids -= &docids; - } - - if level == 0 { - return Some(Ok((left, docids))); - } - - let rtxn = self.rtxn; - let db = self.db; - let fid = self.field_id; - let left = Included(left); - let right = Included(right); - - let result = if is_ascending { - FacetRange::new(rtxn, db, fid, level - 1, left, right).map(Left) - } else { - FacetRevRange::new(rtxn, db, fid, level - 1, left, right).map(Right) - }; - - match result { - Ok(iter) => { - self.level_iters.push((docids, iter)); - continue 'outer; - } - Err(e) => return Some(Err(e)), - } - } - } - Err(e) => return Some(Err(e)), - } - } - self.level_iters.pop(); - } - } -} diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index f40a6aed6..574459547 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -15,7 +15,7 @@ use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; pub(crate) use self::facet::ParserRule; -pub use self::facet::{FacetDistribution, FacetIter, FilterCondition, Operator}; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::FieldIdMapMissingEntry; From 851f9790393daf9693aa2c60c66149b1c17ad0a7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 11:27:17 +0200 Subject: [PATCH 02/12] Describe the way we want to group the facet strings --- milli/src/search/facet/facet_string.rs | 123 +++++++++++++++++++++++++ milli/src/search/facet/mod.rs | 1 + 2 files changed, 124 insertions(+) create mode 100644 milli/src/search/facet/facet_string.rs diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs new file mode 100644 index 000000000..61fc32f8e --- /dev/null +++ b/milli/src/search/facet/facet_string.rs @@ -0,0 +1,123 @@ +//! This module contains helpers iterators for facet strings. +//! +//! The purpose is to help iterate over the quite complex system of facets strings. A simple +//! description of the system would be that every facet string value is stored into an LMDB database +//! and that every value is associated with the document ids which are associated with this facet +//! string value. +//! +//! In reality it is a little bit more complex as we have to create aggregations of runs of facet +//! string values, those aggregations helps in choosing the right groups of facets to follow. +//! +//! ## A typical algorithm run +//! +//! If a group of aggregated facets values contains one of the documents ids, we must continue +//! iterating over the sub-groups. +//! +//! If this group is the lowest level and contain at least one document id we yield the associated +//! facet documents ids. +//! +//! If the group doesn't contain one of our documents ids, we continue to the next group at this +//! same level. +//! +//! ## The complexity comes from the strings +//! +//! This algorithm is exactly the one that we use for facet numbers. It is quite easy to create +//! aggregated facet number, groups of facets are easy to define in the LMDB key, we just put the +//! two numbers bounds, the left and the right bound of the group, both inclusive. +//! +//! It is easy to make sure that the groups are ordered, LMDB sort its keys lexicographically and +//! puting two numbers big-endian encoded one after the other gives us ordered groups. The values +//! are simple unions of the documents ids coming from the groups below. +//! +//! ### Example of what a facet number LMDB database contain +//! +//! | level | left-bound | right-bound | docs | +//! |-------|------------|-------------|------------------| +//! | 0 | 0 | _skipped_ | 1, 2 | +//! | 0 | 1 | _skipped_ | 6, 7 | +//! | 0 | 3 | _skipped_ | 4, 7 | +//! | 0 | 5 | _skipped_ | 2, 3, 4 | +//! | 1 | 0 | 1 | 1, 2, 6, 7 | +//! | 1 | 3 | 5 | 2, 3, 4, 7 | +//! | 2 | 0 | 5 | 1, 2, 3, 4, 6, 7 | +//! +//! As you can see the level 0 have two equal bounds, therefore we skip serializing the second +//! bound, that's the base level where you can directly fetch the documents ids associated with an +//! exact number. +//! +//! The next levels have two different bounds and the associated documents ids are simply the result +//! of an union of all the documents ids associated with the aggregated groups above. +//! +//! ## The complexity of defining groups of facet strings +//! +//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in +//! lexicographical order, it means that whatever the key represent the bytes are read in their raw +//! form and a simple `strcmp` will define the order in which keys will be read from the store. +//! +//! That's easy for types with a known size, like floats or integers, they are 64 bytes long and +//! appending one after the other in big-endian is consistent. LMDB will simply sort the keys by the +//! first number then by the second if the the first number is equal on two keys. +//! +//! For strings it is a lot more complex as those types are unsized, it means that the size of facet +//! strings is different for each facet value. +//! +//! ### Basic approach: padding the keys +//! +//! A first approach would be to simply define the maximum size of a facet string and pad the keys +//! with zeroes. The big problem of this approach is that it: +//! 1. reduces the maximum size of facet strings by half, as we need to put two keys one after the +//! other. +//! 2. makes the keys of facet strings very big (approximately 250 bytes), impacting a lot LMDB +//! performances. +//! +//! ### Better approach: number the facet groups +//! +//! A better approach would be to number the groups, this way we don't have the downsides of the +//! previously described approach but we need to be able to describe the groups by using a number. +//! +//! #### Example of facet strings with numbered groups +//! +//! | level | left-bound | right-bound | left-string | right-string | docs | +//! |-------|------------|-------------|-------------|--------------|------------------| +//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | +//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | +//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | +//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | +//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | +//! | 1 | 3 | 5 | gamma | omega | 2, 3, 4, 7 | +//! | 2 | 0 | 5 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +//! +//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not +//! need to store the facet string value two times. +//! +//! In the value, not in the key, you can see that we added two new values: +//! the left-string and the right-string, which defines the original facet strings associated with +//! the given group. +//! +//! We put those two strings inside of the value, this way we do not limit the maximum size of the +//! facet string values, and the impact on performances is not important as, IIRC, LMDB put big +//! values on another page, this helps in iterating over keys fast enough and only fetch the page +//! with the values when required. +//! +//! The other little advantage with this solution is that there is no a big overhead, compared with +//! the facet number levels, we only duplicate the facet strings once for the level 1. +//! +//! #### A typical algorithm run +//! +//! Note that the algorithm is always moving from the highest level to the lowest one, one level +//! by one level, this is why it is ok to only store the facets string on the level 1. +//! +//! If a group of aggregated facets values, a group with numbers contains one of the documents ids, +//! we must continue iterating over the sub-groups. To do so: +//! - If we are at a level >= 2, we just do the same as with the facet numbers, get both bounds +//! and iterate over the facet groups defined by these numbers over the current level - 1. +//! - If we are at level 1, we retrieve both keys, the left-string and right-string, from the +//! value and just do the same as with the facet numbers but with strings: iterate over the +//! current level - 1 with both keys. +//! +//! If this group is the lowest level (level 0) and contain at least one document id we yield the +//! associated facet documents ids. +//! +//! If the group doesn't contain one of our documents ids, we continue to the next group at this +//! same level. +//! diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index e6ea92543..d92a8e4bd 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -5,5 +5,6 @@ pub(crate) use self::parser::Rule as ParserRule; mod facet_distribution; mod facet_number; +mod facet_string; mod filter_condition; mod parser; From a79661c6dc7fc605f0a4a0a87c7af73941ffb447 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 15:53:28 +0200 Subject: [PATCH 03/12] Introduce a lot of facet string helper iterators --- .../facet/facet_level_value_u32_codec.rs | 52 +++++++ .../facet/facet_string_level_zero_codec.rs | 49 ++++++ .../facet_string_zero_bounds_value_codec.rs | 80 ++++++++++ milli/src/heed_codec/facet/mod.rs | 6 + milli/src/search/facet/facet_string.rs | 140 +++++++++++++++++- 5 files changed, 319 insertions(+), 8 deletions(-) create mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs create mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs new file mode 100644 index 000000000..6b51b306e --- /dev/null +++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs @@ -0,0 +1,52 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::num::NonZeroU8; + +use crate::FieldId; + +/// A codec that stores the field id, level 1 and higher and the groups ids. +/// +/// It can only be used to encode the facet string of the level 1 or higher. +pub struct FacetLevelValueU32Codec; + +impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { + type DItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + let level = NonZeroU8::new(*level)?; + let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?; + let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?; + Some((*field_id, level, left, right)) + } +} + +impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { + type EItem = (FieldId, NonZeroU8, u32, u32); + + fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the big-endian integers. + let bytes = left.to_be_bytes(); + buffer[..4].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[4..8].copy_from_slice(&bytes[..]); + + // Then the u32 values just to be able to read them back. + let bytes = left.to_be_bytes(); + buffer[8..12].copy_from_slice(&bytes[..]); + + let bytes = right.to_be_bytes(); + buffer[12..].copy_from_slice(&bytes[..]); + + let mut bytes = Vec::with_capacity(buffer.len() + 2); + bytes.push(*field_id); + bytes.push(level.get()); + bytes.extend_from_slice(&buffer); + + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs new file mode 100644 index 000000000..1c0c4be93 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs @@ -0,0 +1,49 @@ +use std::borrow::Cow; +use std::str; + +use crate::FieldId; + +/// A codec that stores the field id, level 0, and facet string. +/// +/// It can only be used to encode the facet string of the level 0, +/// as it hardcodes the level. +/// +/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys, +/// and make sure that the levels are not mixed-up. The level 0 is special, the key +/// are strings, other levels represent groups and keys are simply two integers. +pub struct FacetStringLevelZeroCodec; + +impl FacetStringLevelZeroCodec { + pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { + out.reserve(value.len() + 2); + out.push(field_id); + out.push(0); // the level zero (for LMDB ordering only) + out.extend_from_slice(value.as_bytes()); + } +} + +impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { + type DItem = (FieldId, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (field_id, bytes) = bytes.split_first()?; + let (level, bytes) = bytes.split_first()?; + + if *level != 0 { + return None; + } + + let value = str::from_utf8(bytes).ok()?; + Some((*field_id, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec { + type EItem = (FieldId, &'a str); + + fn bytes_encode((field_id, value): &Self::EItem) -> Option> { + let mut bytes = Vec::new(); + FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes); + Some(Cow::Owned(bytes)) + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs new file mode 100644 index 000000000..3c2ce4657 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +/// A codec that encodes two strings in front of the value. +/// +/// The usecase is for the facet string levels algorithm where we must +/// know the origin of a group, the group left and right bounds are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringZeroBoundsValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (Option<(&'a str, &'a str)>, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (contains_bounds, tail_bytes) = bytes.split_first()?; + + if *contains_bounds != 0 { + let (left_len, bytes) = try_split_at(bytes, 2)?; + let (right_len, bytes) = try_split_at(bytes, 2)?; + + let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?; + let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?; + + let (left, bytes) = try_split_at(bytes, left_len as usize)?; + let (right, bytes) = try_split_at(bytes, right_len as usize)?; + + let left = str::from_utf8(left).ok()?; + let right = str::from_utf8(right).ok()?; + + C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) + } else { + C::bytes_decode(tail_bytes).map(|item| (None, item)) + } + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (Option<(&'a str, &'a str)>, C::EItem); + + fn bytes_encode((bounds, value): &'a Self::EItem) -> Option> { + let mut bytes = Vec::new(); + + match bounds { + Some((left, right)) => { + let left_len: u16 = left.len().try_into().ok()?; + let right_len: u16 = right.len().try_into().ok()?; + bytes.extend_from_slice(&left_len.to_be_bytes()); + bytes.extend_from_slice(&right_len.to_be_bytes()); + + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } + None => { + bytes.push(0); + let value_bytes = C::bytes_encode(&value)?; + bytes.extend_from_slice(&value_bytes[..]); + Some(Cow::Owned(bytes)) + } + } + } +} + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 532da12fa..90dc79134 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,9 +1,15 @@ mod facet_level_value_f64_codec; +mod facet_level_value_u32_codec; +mod facet_string_level_zero_codec; +mod facet_string_zero_bounds_value_codec; mod facet_value_string_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; +pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; +pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::facet_value_string_codec::FacetValueStringCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 61fc32f8e..d4d85153f 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -31,7 +31,7 @@ //! //! ### Example of what a facet number LMDB database contain //! -//! | level | left-bound | right-bound | docs | +//! | level | left-bound | right-bound | documents ids | //! |-------|------------|-------------|------------------| //! | 0 | 0 | _skipped_ | 1, 2 | //! | 0 | 1 | _skipped_ | 6, 7 | @@ -48,7 +48,7 @@ //! The next levels have two different bounds and the associated documents ids are simply the result //! of an union of all the documents ids associated with the aggregated groups above. //! -//! ## The complexity of defining groups of facet strings +//! ## The complexity of defining groups for facet strings //! //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in //! lexicographical order, it means that whatever the key represent the bytes are read in their raw @@ -77,22 +77,25 @@ //! //! #### Example of facet strings with numbered groups //! -//! | level | left-bound | right-bound | left-string | right-string | docs | +//! | level | left-bound | right-bound | left-string | right-string | documents ids | //! |-------|------------|-------------|-------------|--------------|------------------| //! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 | //! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 | //! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 | //! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 | //! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 | -//! | 1 | 3 | 5 | gamma | omega | 2, 3, 4, 7 | -//! | 2 | 0 | 5 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | +//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 | +//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 | //! //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not //! need to store the facet string value two times. //! -//! In the value, not in the key, you can see that we added two new values: -//! the left-string and the right-string, which defines the original facet strings associated with -//! the given group. +//! The number in the left-bound and right-bound columns are incremental numbers representing the +//! level 0 strings, .i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering +//! of the LMDB keys. +//! +//! In the value, not in the key, you can see that we added two new values: the left-string and the +//! right-string, which defines the original facet strings associated with the given group. //! //! We put those two strings inside of the value, this way we do not limit the maximum size of the //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big @@ -121,3 +124,124 @@ //! If the group doesn't contain one of our documents ids, we continue to the next group at this //! same level. //! + +use std::num::NonZeroU8; +use std::ops::Bound; +use std::ops::Bound::{Excluded, Included}; + +use heed::types::{ByteSlice, Str}; +use heed::{Database, LazyDecode, RoRange}; +use roaring::RoaringBitmap; + +use crate::heed_codec::facet::{ + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, +}; +use crate::heed_codec::CboRoaringBitmapCodec; +use crate::FieldId; + +/// An iterator that is used to explore the facets level strings +/// from the level 1 to infinity. +/// +/// It yields the level, group id that an entry covers, the optional group strings +/// that it covers of the level 0 only if it is an entry from the level 1 and +/// the roaring bitmap associated. +pub struct FacetStringGroupRange<'t> { + iter: RoRange< + 't, + FacetLevelValueU32Codec, + LazyDecode>, + >, + end: Bound, +} + +impl<'t> FacetStringGroupRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >, + field_id: FieldId, + level: NonZeroU8, + left: Bound, + right: Bound, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, level, left, u32::MIN)), + Excluded(left) => Excluded((field_id, level, left, u32::MIN)), + Unbounded => Included((field_id, level, u32::MIN, u32::MIN)), + }; + let right_bound = Included((field_id, level, u32::MAX, u32::MAX)); + let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?; + Ok(FacetStringGroupRange { iter, end: right }) + } +} + +impl<'t> Iterator for FacetStringGroupRange<'t> { + type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, level, left, right), docids))) => { + let must_be_returned = match self.end { + Included(end) => right <= end, + Excluded(end) => right < end, + Unbounded => true, + }; + if must_be_returned { + match docids.decode() { + Ok(docids) => Some(Ok(((level, left, right), docids))), + Err(e) => Some(Err(e)), + } + } else { + None + } + } + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} + +/// An iterator that is used to explore the level 0 of the facets string database. +/// +/// It yields the facet string and the roaring bitmap associated with it. +pub struct FacetStringLevelZeroRange<'t> { + iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, +} + +impl<'t> FacetStringLevelZeroRange<'t> { + pub fn new( + rtxn: &'t heed::RoTxn, + db: Database, + field_id: FieldId, + left: Bound<&str>, + right: Bound<&str>, + ) -> heed::Result> { + let left_bound = match left { + Included(left) => Included((field_id, left)), + Excluded(left) => Excluded((field_id, left)), + Unbounded => Included((field_id, "")), + }; + + let right_bound = match right { + Included(right) => Included((field_id, right)), + Excluded(right) => Excluded((field_id, right)), + Unbounded => Excluded((field_id + 1, "")), + }; + + db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter }) + } +} + +impl<'t> Iterator for FacetStringLevelZeroRange<'t> { + type Item = heed::Result<(&'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + match self.iter.next() { + Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), + Some(Err(e)) => Some(Err(e)), + None => None, + } + } +} From adfd4da24cbd17b3d07a55d8a54b4555090c4e54 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 16:36:51 +0200 Subject: [PATCH 04/12] Introduce the FacetStringIter iterator --- milli/src/search/facet/facet_number.rs | 2 +- milli/src/search/facet/facet_string.rs | 149 +++++++++++++++++++++++-- 2 files changed, 140 insertions(+), 11 deletions(-) diff --git a/milli/src/search/facet/facet_number.rs b/milli/src/search/facet/facet_number.rs index f943b96da..02390aac1 100644 --- a/milli/src/search/facet/facet_number.rs +++ b/milli/src/search/facet/facet_number.rs @@ -147,7 +147,7 @@ impl<'t> FacetNumberIter<'t> { field_id: FieldId, documents_ids: RoaringBitmap, ) -> heed::Result> { - let db = index.facet_id_f64_docids.remap_key_type::(); + let db = index.facet_id_f64_docids; let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = FacetNumberRevRange::new(rtxn, db, field_id, highest_level, Unbounded, Unbounded)?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index d4d85153f..09781f883 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -127,9 +127,10 @@ use std::num::NonZeroU8; use std::ops::Bound; -use std::ops::Bound::{Excluded, Included}; +use std::ops::Bound::{Excluded, Included, Unbounded}; -use heed::types::{ByteSlice, Str}; +use either::{Either, Left, Right}; +use heed::types::{ByteSlice, DecodeIgnore, Str}; use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; @@ -137,7 +138,7 @@ use crate::heed_codec::facet::{ FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; -use crate::FieldId; +use crate::{FieldId, Index}; /// An iterator that is used to explore the facets level strings /// from the level 1 to infinity. @@ -155,17 +156,18 @@ pub struct FacetStringGroupRange<'t> { } impl<'t> FacetStringGroupRange<'t> { - pub fn new( + pub fn new( rtxn: &'t heed::RoTxn, - db: Database< - FacetLevelValueU32Codec, - FacetStringZeroBoundsValueCodec, - >, + db: Database, field_id: FieldId, level: NonZeroU8, left: Bound, right: Bound, ) -> heed::Result> { + let db = db.remap_types::< + FacetLevelValueU32Codec, + FacetStringZeroBoundsValueCodec, + >(); let left_bound = match left { Included(left) => Included((field_id, level, left, u32::MIN)), Excluded(left) => Excluded((field_id, level, left, u32::MIN)), @@ -211,13 +213,14 @@ pub struct FacetStringLevelZeroRange<'t> { } impl<'t> FacetStringLevelZeroRange<'t> { - pub fn new( + pub fn new( rtxn: &'t heed::RoTxn, - db: Database, + db: Database, field_id: FieldId, left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { + let db = db.remap_types::(); let left_bound = match left { Included(left) => Included((field_id, left)), Excluded(left) => Excluded((field_id, left)), @@ -245,3 +248,129 @@ impl<'t> Iterator for FacetStringLevelZeroRange<'t> { } } } + +/// An iterator that is used to explore the facet strings level by level, +/// it will only return facets strings that are associated with the +/// candidates documents ids given. +pub struct FacetStringIter<'t> { + rtxn: &'t heed::RoTxn<'t>, + db: Database, + field_id: FieldId, + level_iters: + Vec<(RoaringBitmap, Either, FacetStringLevelZeroRange<'t>>)>, +} + +impl<'t> FacetStringIter<'t> { + pub fn new_non_reducing( + rtxn: &'t heed::RoTxn, + index: &'t Index, + field_id: FieldId, + documents_ids: RoaringBitmap, + ) -> heed::Result> { + // TODO make sure that we change the database before using it, or merging the PR. + let db = index.facet_id_string_docids.remap_types::(); + let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); + let highest_iter = match NonZeroU8::new(highest_level) { + Some(highest_level) => Left(FacetStringGroupRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + highest_level, + Unbounded, + Unbounded, + )?), + None => Right(FacetStringLevelZeroRange::new( + rtxn, + index.facet_id_string_docids, + field_id, + Unbounded, + Unbounded, + )?), + }; + + Ok(FacetStringIter { rtxn, db, field_id, level_iters: vec![(documents_ids, highest_iter)] }) + } + + fn highest_level( + rtxn: &'t heed::RoTxn, + db: Database, + fid: FieldId, + ) -> heed::Result> { + Ok(db + .remap_types::() + .prefix_iter(rtxn, &[fid][..])? // the field id is the first bit + .last() + .transpose()? + .map(|(key_bytes, _)| key_bytes[1])) // the level is the second bit + } +} + +impl<'t> Iterator for FacetStringIter<'t> { + type Item = heed::Result<(&'t str, RoaringBitmap)>; + + fn next(&mut self) -> Option { + 'outer: loop { + let (documents_ids, last) = self.level_iters.last_mut()?; + match last { + Left(last) => { + for result in last { + match result { + Ok(((level, left, right), (string_bounds, mut docids))) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + + let result = match string_bounds { + Some((left, right)) => FacetStringLevelZeroRange::new( + self.rtxn, + self.db, + self.field_id, + Included(left), + Included(right), + ) + .map(Right), + None => FacetStringGroupRange::new( + self.rtxn, + self.db, + self.field_id, + NonZeroU8::new(level.get() - 1).unwrap(), + Included(left), + Included(right), + ) + .map(Left), + }; + + match result { + Ok(iter) => { + self.level_iters.push((docids, iter)); + continue 'outer; + } + Err(e) => return Some(Err(e)), + } + } + } + Err(e) => return Some(Err(e)), + } + } + } + Right(last) => { + // level zero only + for result in last { + match result { + Ok((value, mut docids)) => { + docids &= &*documents_ids; + if !docids.is_empty() { + *documents_ids -= &docids; + return Some(Ok((value, docids))); + } + } + Err(e) => return Some(Err(e)), + } + } + } + } + + self.level_iters.pop(); + } + } +} From 757b2b502aab738f7539fbfc036fa7c8135e7ff4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 16:43:27 +0200 Subject: [PATCH 05/12] Remove the FacetValueStringCodec --- .../facet/facet_level_value_u32_codec.rs | 15 ++++++++------- .../facet/facet_string_level_zero_codec.rs | 9 +++++---- milli/src/heed_codec/facet/mod.rs | 2 -- milli/src/index.rs | 4 ++-- milli/src/search/facet/facet_distribution.rs | 8 ++++---- milli/src/search/facet/facet_string.rs | 6 +++--- milli/src/search/facet/filter_condition.rs | 4 ++-- milli/src/update/index_documents/store.rs | 4 ++-- 8 files changed, 26 insertions(+), 26 deletions(-) diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs index 6b51b306e..597335b6e 100644 --- a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs +++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs @@ -2,7 +2,7 @@ use std::borrow::Cow; use std::convert::TryInto; use std::num::NonZeroU8; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; /// A codec that stores the field id, level 1 and higher and the groups ids. /// @@ -13,12 +13,13 @@ impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec { type DItem = (FieldId, NonZeroU8, u32, u32); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; let level = NonZeroU8::new(*level)?; - let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?; - let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?; - Some((*field_id, level, left, right)) + let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?; + let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?; + Some((field_id, level, left, right)) } } @@ -42,8 +43,8 @@ impl heed::BytesEncode<'_> for FacetLevelValueU32Codec { let bytes = right.to_be_bytes(); buffer[12..].copy_from_slice(&bytes[..]); - let mut bytes = Vec::with_capacity(buffer.len() + 2); - bytes.push(*field_id); + let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1); + bytes.extend_from_slice(&field_id.to_be_bytes()); bytes.push(level.get()); bytes.extend_from_slice(&buffer); diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs index 1c0c4be93..009c6454a 100644 --- a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::str; -use crate::FieldId; +use crate::{try_split_array_at, FieldId}; /// A codec that stores the field id, level 0, and facet string. /// @@ -16,7 +16,7 @@ pub struct FacetStringLevelZeroCodec; impl FacetStringLevelZeroCodec { pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec) { out.reserve(value.len() + 2); - out.push(field_id); + out.extend_from_slice(&field_id.to_be_bytes()); out.push(0); // the level zero (for LMDB ordering only) out.extend_from_slice(value.as_bytes()); } @@ -26,7 +26,8 @@ impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { type DItem = (FieldId, &'a str); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (field_id, bytes) = bytes.split_first()?; + let (field_id_bytes, bytes) = try_split_array_at(bytes)?; + let field_id = u16::from_be_bytes(field_id_bytes); let (level, bytes) = bytes.split_first()?; if *level != 0 { @@ -34,7 +35,7 @@ impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec { } let value = str::from_utf8(bytes).ok()?; - Some((*field_id, value)) + Some((field_id, value)) } } diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index 90dc79134..ecab7eb7c 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -2,7 +2,6 @@ mod facet_level_value_f64_codec; mod facet_level_value_u32_codec; mod facet_string_level_zero_codec; mod facet_string_zero_bounds_value_codec; -mod facet_value_string_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -10,6 +9,5 @@ pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; -pub use self::facet_value_string_codec::FacetValueStringCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; diff --git a/milli/src/index.rs b/milli/src/index.rs index 099a5891d..b2be10767 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,7 +11,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ @@ -91,7 +91,7 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + pub facet_id_string_docids: Database, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 080fd9af7..ceefe785b 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -1,6 +1,6 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; -use std::{cmp, fmt}; +use std::{cmp, fmt, mem}; use heed::types::{ByteSlice, Unit}; use heed::{BytesDecode, Database}; @@ -8,7 +8,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetValueStringCodec; +use crate::heed_codec::facet::FacetStringLevelZeroCodec; use crate::search::facet::{FacetNumberIter, FacetNumberRange}; use crate::{DocumentId, FieldId, Index, Result}; @@ -81,7 +81,7 @@ impl<'a> FacetDistribution<'a> { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { - key_buffer.truncate(1); + key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db .remap_key_type::() @@ -158,7 +158,7 @@ impl<'a> FacetDistribution<'a> { .facet_id_string_docids .remap_key_type::() .prefix_iter(self.rtxn, &field_id.to_be_bytes())? - .remap_key_type::(); + .remap_key_type::(); for result in iter { let ((_, value), docids) = result?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 09781f883..d0d9b54eb 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -130,7 +130,7 @@ use std::ops::Bound; use std::ops::Bound::{Excluded, Included, Unbounded}; use either::{Either, Left, Right}; -use heed::types::{ByteSlice, DecodeIgnore, Str}; +use heed::types::{ByteSlice, DecodeIgnore}; use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; @@ -298,10 +298,10 @@ impl<'t> FacetStringIter<'t> { ) -> heed::Result> { Ok(db .remap_types::() - .prefix_iter(rtxn, &[fid][..])? // the field id is the first bit + .prefix_iter(rtxn, &fid.to_be_bytes())? // the field id is the first two bits .last() .transpose()? - .map(|(key_bytes, _)| key_bytes[1])) // the level is the second bit + .map(|(key_bytes, _)| key_bytes[2])) // the level is the third bit } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 875fe3b27..c5ecb5a79 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -17,7 +17,7 @@ use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetValueStringCodec}; +use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -363,7 +363,7 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database, field_id: FieldId, operator: &Operator, ) -> Result { diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index ebf365f44..f0225ff43 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -26,7 +26,7 @@ use super::merge_function::{ use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetValueStringCodec, FieldDocIdFacetF64Codec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; @@ -522,7 +522,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { key_buffer.clear(); data_buffer.clear(); - FacetValueStringCodec::serialize_into(field_id, &value, &mut key_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); if lmdb_key_valid_size(&key_buffer) { From a7ae552ba78b19d7e2068f7d23374817b8019258 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:05:57 +0200 Subject: [PATCH 06/12] Fix the FacetStringLevelZeroRange range when unbounded --- milli/src/search/facet/facet_string.rs | 48 +++++++++++++++++++------- 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index d0d9b54eb..559bd41b6 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -210,6 +210,7 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, + field_id: FieldId, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -220,20 +221,43 @@ impl<'t> FacetStringLevelZeroRange<'t> { left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { - let db = db.remap_types::(); - let left_bound = match left { - Included(left) => Included((field_id, left)), - Excluded(left) => Excluded((field_id, left)), - Unbounded => Included((field_id, "")), - }; + fn encode_bound<'a>( + buffer: &'a mut Vec, + field_id: FieldId, + bound: Bound<&str>, + ) -> Bound<&'a [u8]> { + match bound { + Included(value) => { + buffer.push(field_id); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + Included(&buffer[..]) + } + Excluded(value) => { + buffer.push(field_id); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + Excluded(&buffer[..]) + } + Unbounded => { + buffer.push(field_id); + buffer.push(1); // we must only get the level 0 + Excluded(&buffer[..]) + } + } + } - let right_bound = match right { - Included(right) => Included((field_id, right)), - Excluded(right) => Excluded((field_id, right)), - Unbounded => Excluded((field_id + 1, "")), - }; + let mut left_buffer = Vec::new(); + let mut right_buffer = Vec::new(); + let left_bound = encode_bound(&mut left_buffer, field_id, left); + let right_bound = encode_bound(&mut right_buffer, field_id, right); - db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter }) + let iter = db + .remap_key_type::() + .range(rtxn, &(left_bound, right_bound))? + .remap_types::(); + + Ok(FacetStringLevelZeroRange { iter, field_id }) } } From 8c86348119a431cdffa784f8769b3448c77e00de Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 17:20:04 +0200 Subject: [PATCH 07/12] Indexing the facet strings levels --- milli/src/search/facet/facet_string.rs | 9 +- milli/src/update/facets.rs | 152 ++++++++++++++++++++++--- 2 files changed, 142 insertions(+), 19 deletions(-) diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 559bd41b6..509bb4f0c 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -210,7 +210,6 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, - field_id: FieldId, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -228,19 +227,19 @@ impl<'t> FacetStringLevelZeroRange<'t> { ) -> Bound<&'a [u8]> { match bound { Included(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Included(&buffer[..]) } Excluded(value) => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(0); buffer.extend_from_slice(value.as_bytes()); Excluded(&buffer[..]) } Unbounded => { - buffer.push(field_id); + buffer.extend_from_slice(&field_id.to_be_bytes()); buffer.push(1); // we must only get the level 0 Excluded(&buffer[..]) } @@ -257,7 +256,7 @@ impl<'t> FacetStringLevelZeroRange<'t> { .range(rtxn, &(left_bound, right_bound))? .remap_types::(); - Ok(FacetStringLevelZeroRange { iter, field_id }) + Ok(FacetStringLevelZeroRange { iter }) } } diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index 5fabbc504..d3bba6d6e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -1,6 +1,6 @@ -use std::cmp; use std::fs::File; -use std::num::NonZeroUsize; +use std::num::{NonZeroU8, NonZeroUsize}; +use std::{cmp, mem}; use chrono::Utc; use grenad::{CompressionType, FileFuse, Reader, Writer}; @@ -10,7 +10,10 @@ use log::debug; use roaring::RoaringBitmap; use crate::error::InternalError; -use crate::heed_codec::facet::FacetLevelValueF64Codec; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, + FacetStringZeroBoundsValueCodec, +}; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ create_writer, write_into_lmdb_database, writer_into_reader, WriteMethod, @@ -64,6 +67,13 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { debug!("Computing and writing the facet values levels docids into LMDB on disk..."); for field_id in faceted_fields { + // Clear the facet string levels. + clear_field_string_levels( + self.wtxn, + self.index.facet_id_string_docids.remap_types::(), + field_id, + )?; + // Compute and store the faceted strings documents ids. let string_documents_ids = compute_faceted_documents_ids( self.wtxn, @@ -71,6 +81,17 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; + let facet_string_levels = compute_facet_string_levels( + self.wtxn, + self.index.facet_id_string_docids, + self.chunk_compression_type, + self.chunk_compression_level, + self.chunk_fusing_shrink_size, + self.level_group_size, + self.min_level_size, + field_id, + )?; + // Clear the facet number levels. clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; @@ -81,7 +102,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { field_id, )?; - let content = compute_facet_number_levels( + let facet_number_levels = compute_facet_number_levels( self.wtxn, self.index.facet_id_f64_docids, self.chunk_compression_type, @@ -106,8 +127,16 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { write_into_lmdb_database( self.wtxn, *self.index.facet_id_f64_docids.as_polymorph(), - content, - |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number level" }), + facet_number_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet number levels" }), + WriteMethod::GetMergePut, + )?; + + write_into_lmdb_database( + self.wtxn, + *self.index.facet_id_string_docids.as_polymorph(), + facet_string_levels, + |_, _| Err(InternalError::IndexingMergingKeys { process: "facet string levels" }), WriteMethod::GetMergePut, )?; } @@ -193,6 +222,21 @@ fn compute_facet_number_levels<'t>( writer_into_reader(writer, shrink_size) } +fn write_number_entry( + writer: &mut Writer, + field_id: FieldId, + level: u8, + left: f64, + right: f64, + ids: &RoaringBitmap, +) -> Result<()> { + let key = (field_id, level, left, right); + let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + writer.insert(&key, &data)?; + Ok(()) +} + fn compute_faceted_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, @@ -208,17 +252,97 @@ fn compute_faceted_documents_ids( Ok(documents_ids) } -fn write_number_entry( +fn clear_field_string_levels<'t>( + wtxn: &'t mut heed::RwTxn, + db: heed::Database, + field_id: FieldId, +) -> heed::Result<()> { + let left = (field_id, NonZeroU8::new(1).unwrap(), u32::MIN, u32::MIN); + let right = (field_id, NonZeroU8::new(u8::MAX).unwrap(), u32::MAX, u32::MAX); + let range = left..=right; + db.remap_key_type::().delete_range(wtxn, &range).map(drop) +} + +fn compute_facet_string_levels<'t>( + rtxn: &'t heed::RoTxn, + db: heed::Database, + compression_type: CompressionType, + compression_level: Option, + shrink_size: Option, + level_group_size: NonZeroUsize, + min_level_size: NonZeroUsize, + field_id: FieldId, +) -> Result> { + let first_level_size = db + .remap_key_type::() + .prefix_iter(rtxn, &field_id.to_be_bytes())? + .remap_types::() + .fold(Ok(0usize), |count, result| result.and(count).map(|c| c + 1))?; + + // It is forbidden to keep a cursor and write in a database at the same time with LMDB + // therefore we write the facet levels entries into a grenad file before transfering them. + let mut writer = tempfile::tempfile() + .and_then(|file| create_writer(compression_type, compression_level, file))?; + + // Groups sizes are always a power of the original level_group_size and therefore a group + // always maps groups of the previous level and never splits previous levels groups in half. + let group_size_iter = (1u8..) + .map(|l| (l, level_group_size.get().pow(l as u32))) + .take_while(|(_, s)| first_level_size / *s >= min_level_size.get()); + + for (level, group_size) in group_size_iter { + let level = NonZeroU8::new(level).unwrap(); + let mut left = (0, ""); + let mut right = (0, ""); + let mut group_docids = RoaringBitmap::new(); + + // Because we know the size of the level 0 we can use a range iterator that starts + // at the first value of the level and goes to the last by simply counting. + for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { + let ((_field_id, value), docids) = result?; + + if i == 0 { + left = (i as u32, value); + } else if i % group_size == 0 { + // we found the first bound of the next group, we must store the left + // and right bounds associated with the docids. We also reset the docids. + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + + // We save the left bound for the new group. + left = (i as u32, value); + } + + // The right bound is always the bound we run through. + group_docids |= docids; + right = (i as u32, value); + } + + if !group_docids.is_empty() { + let docids = mem::take(&mut group_docids); + write_string_entry(&mut writer, field_id, level, left, right, docids)?; + } + } + + writer_into_reader(writer, shrink_size) +} + +fn write_string_entry( writer: &mut Writer, field_id: FieldId, - level: u8, - left: f64, - right: f64, - ids: &RoaringBitmap, + level: NonZeroU8, + (left_id, left_value): (u32, &str), + (right_id, right_value): (u32, &str), + docids: RoaringBitmap, ) -> Result<()> { - let key = (field_id, level, left, right); - let key = FacetLevelValueF64Codec::bytes_encode(&key).ok_or(Error::Encoding)?; - let data = CboRoaringBitmapCodec::bytes_encode(&ids).ok_or(Error::Encoding)?; + let key = (field_id, level, left_id, right_id); + let key = FacetLevelValueU32Codec::bytes_encode(&key).ok_or(Error::Encoding)?; + let data = match level.get() { + 1 => (Some((left_value, right_value)), docids), + _ => (None, docids), + }; + let data = FacetStringZeroBoundsValueCodec::::bytes_encode(&data) + .ok_or(Error::Encoding)?; writer.insert(&key, &data)?; Ok(()) } From 5676b204dd388c8cca7458ac7a3dd7ab7e0be565 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 4 Jul 2021 18:09:53 +0200 Subject: [PATCH 08/12] Fix the facet string levels codecs --- .../facet_string_zero_bounds_value_codec.rs | 46 ++++++++++++++++++- milli/src/update/index_documents/store.rs | 4 ++ 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 3c2ce4657..6161118b6 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -16,7 +16,7 @@ where type DItem = (Option<(&'a str, &'a str)>, C::DItem); fn bytes_decode(bytes: &'a [u8]) -> Option { - let (contains_bounds, tail_bytes) = bytes.split_first()?; + let (contains_bounds, bytes) = bytes.split_first()?; if *contains_bounds != 0 { let (left_len, bytes) = try_split_at(bytes, 2)?; @@ -33,7 +33,7 @@ where C::bytes_decode(bytes).map(|item| (Some((left, right)), item)) } else { - C::bytes_decode(tail_bytes).map(|item| (None, item)) + C::bytes_decode(bytes).map(|item| (None, item)) } } } @@ -49,11 +49,21 @@ where match bounds { Some((left, right)) => { + bytes.push(u8::max_value()); + + if left.is_empty() || right.is_empty() { + return None; + } + let left_len: u16 = left.len().try_into().ok()?; let right_len: u16 = right.len().try_into().ok()?; + bytes.extend_from_slice(&left_len.to_be_bytes()); bytes.extend_from_slice(&right_len.to_be_bytes()); + bytes.extend_from_slice(left.as_bytes()); + bytes.extend_from_slice(right.as_bytes()); + let value_bytes = C::bytes_encode(&value)?; bytes.extend_from_slice(&value_bytes[..]); @@ -78,3 +88,35 @@ fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { None } } + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let bounds = Some(("abc", "def")); + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (bounds, docids.clone()); + let bytes = + FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_docids) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_docids), (bounds, docids)); + } + + #[test] + fn deserialize_unit() { + let bounds = Some(("abc", "def")); + let key = (bounds, ()); + let bytes = FacetStringZeroBoundsValueCodec::::bytes_encode(&key).unwrap(); + let (out_bounds, out_unit) = + FacetStringZeroBoundsValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_bounds, out_unit), (bounds, ())); + } +} diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index f0225ff43..4c1071aab 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -286,6 +286,10 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { value: String, id: DocumentId, ) -> Result<()> { + if value.is_empty() { + return Ok(()); + } + let sorter = &mut self.field_id_docid_facet_strings_sorter; Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; From 081278dfd6b99f720f409db92bdef8193bdadf16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 4 Jul 2021 18:11:26 +0200 Subject: [PATCH 09/12] Use the facet string levels when computing the facet distribution --- milli/src/search/facet/facet_distribution.rs | 28 +++++++++++++++++--- milli/src/search/facet/facet_string.rs | 2 +- milli/src/search/facet/mod.rs | 1 + 3 files changed, 27 insertions(+), 4 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index ceefe785b..6382e15e1 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -9,7 +9,7 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; use crate::heed_codec::facet::FacetStringLevelZeroCodec; -use crate::search::facet::{FacetNumberIter, FacetNumberRange}; +use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; use crate::{DocumentId, FieldId, Index, Result}; /// The default number of values by facets that will @@ -134,6 +134,29 @@ impl<'a> FacetDistribution<'a> { Ok(()) } + fn facet_strings_distribution_from_facet_levels( + &self, + field_id: FieldId, + candidates: &RoaringBitmap, + distribution: &mut BTreeMap, + ) -> heed::Result<()> { + let iter = + FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; + + for result in iter { + let (value, mut docids) = result?; + docids &= candidates; + if !docids.is_empty() { + distribution.insert(value.to_string(), docids.len()); + } + if distribution.len() == self.max_values_by_facet { + break; + } + } + + Ok(()) + } + /// Placeholder search, a.k.a. no candidates were specified. We iterate throught the /// facet values one by one and iterate on the facet level 0 for numbers. fn facet_values_from_raw_facet_database( @@ -198,9 +221,8 @@ impl<'a> FacetDistribution<'a> { candidates, &mut distribution, )?; - self.facet_distribution_from_documents( + self.facet_strings_distribution_from_facet_levels( field_id, - String, candidates, &mut distribution, )?; diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index 509bb4f0c..e1fe6ab74 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -192,7 +192,7 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { }; if must_be_returned { match docids.decode() { - Ok(docids) => Some(Ok(((level, left, right), docids))), + Ok((bounds, docids)) => Some(Ok(((level, left, right), (bounds, docids)))), Err(e) => Some(Err(e)), } } else { diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index d92a8e4bd..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,5 +1,6 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; +pub use self::facet_string::FacetStringIter; pub use self::filter_condition::{FilterCondition, Operator}; pub(crate) use self::parser::Rule as ParserRule; From d23c250ad5928878a6db1f61272a6fec7093a1dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 13 Jul 2021 20:04:48 +0200 Subject: [PATCH 10/12] Fix a bound error in the facet string range construction --- milli/src/search/facet/facet_distribution.rs | 4 +- milli/src/search/facet/facet_string.rs | 51 +++++++++----------- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 6382e15e1..fef4ecc87 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -22,7 +22,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 1000; +const CANDIDATES_THRESHOLD: u64 = 35_000; pub struct FacetDistribution<'a> { facets: Option>, @@ -80,7 +80,7 @@ impl<'a> FacetDistribution<'a> { { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - for docid in candidates.into_iter().take(CANDIDATES_THRESHOLD as usize) { + for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); key_buffer.extend_from_slice(&docid.to_be_bytes()); let iter = db diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index e1fe6ab74..f0b527104 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -220,36 +220,34 @@ impl<'t> FacetStringLevelZeroRange<'t> { left: Bound<&str>, right: Bound<&str>, ) -> heed::Result> { - fn encode_bound<'a>( - buffer: &'a mut Vec, - field_id: FieldId, - bound: Bound<&str>, - ) -> Bound<&'a [u8]> { - match bound { - Included(value) => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - Included(&buffer[..]) - } - Excluded(value) => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(0); - buffer.extend_from_slice(value.as_bytes()); - Excluded(&buffer[..]) - } - Unbounded => { - buffer.extend_from_slice(&field_id.to_be_bytes()); - buffer.push(1); // we must only get the level 0 - Excluded(&buffer[..]) - } - } + fn encode_value<'a>(buffer: &'a mut Vec, field_id: FieldId, value: &str) -> &'a [u8] { + buffer.extend_from_slice(&field_id.to_be_bytes()); + buffer.push(0); + buffer.extend_from_slice(value.as_bytes()); + &buffer[..] } let mut left_buffer = Vec::new(); + let left_bound = match left { + Included(value) => Included(encode_value(&mut left_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut left_buffer, field_id, value)), + Unbounded => { + left_buffer.extend_from_slice(&field_id.to_be_bytes()); + left_buffer.push(0); + Included(&left_buffer[..]) + } + }; + let mut right_buffer = Vec::new(); - let left_bound = encode_bound(&mut left_buffer, field_id, left); - let right_bound = encode_bound(&mut right_buffer, field_id, right); + let right_bound = match right { + Included(value) => Included(encode_value(&mut right_buffer, field_id, value)), + Excluded(value) => Excluded(encode_value(&mut right_buffer, field_id, value)), + Unbounded => { + right_buffer.extend_from_slice(&field_id.to_be_bytes()); + right_buffer.push(1); // we must only get the level 0 + Excluded(&right_buffer[..]) + } + }; let iter = db .remap_key_type::() @@ -290,7 +288,6 @@ impl<'t> FacetStringIter<'t> { field_id: FieldId, documents_ids: RoaringBitmap, ) -> heed::Result> { - // TODO make sure that we change the database before using it, or merging the PR. let db = index.facet_id_string_docids.remap_types::(); let highest_level = Self::highest_level(rtxn, db, field_id)?.unwrap_or(0); let highest_iter = match NonZeroU8::new(highest_level) { From 03a01166bafa011f90920b945d69e8b67aa9ed5b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 15 Jul 2021 10:19:35 +0200 Subject: [PATCH 11/12] Display the original facet string value from the linear facet database --- .../facet/field_doc_id_facet_string_codec.rs | 19 +++-- milli/src/index.rs | 2 +- milli/src/search/distinct/facet_distinct.rs | 8 +- milli/src/search/facet/facet_distribution.rs | 83 ++++++++++--------- milli/src/update/delete_documents.rs | 11 +-- milli/src/update/index_documents/store.rs | 63 ++++++++------ milli/src/update/settings.rs | 3 +- 7 files changed, 108 insertions(+), 81 deletions(-) diff --git a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs index 36408f578..178bb21c1 100644 --- a/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs +++ b/milli/src/heed_codec/facet/field_doc_id_facet_string_codec.rs @@ -9,13 +9,13 @@ impl FieldDocIdFacetStringCodec { pub fn serialize_into( field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, out: &mut Vec, ) { - out.reserve(2 + 4 + value.len()); + out.reserve(2 + 4 + normalized_value.len()); out.extend_from_slice(&field_id.to_be_bytes()); out.extend_from_slice(&document_id.to_be_bytes()); - out.extend_from_slice(value.as_bytes()); + out.extend_from_slice(normalized_value.as_bytes()); } } @@ -29,17 +29,22 @@ impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec { let (document_id_bytes, bytes) = try_split_array_at(bytes)?; let document_id = u32::from_be_bytes(document_id_bytes); - let value = str::from_utf8(bytes).ok()?; - Some((field_id, document_id, value)) + let normalized_value = str::from_utf8(bytes).ok()?; + Some((field_id, document_id, normalized_value)) } } impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec { type EItem = (FieldId, DocumentId, &'a str); - fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option> { + fn bytes_encode((field_id, document_id, normalized_value): &Self::EItem) -> Option> { let mut bytes = Vec::new(); - FieldDocIdFacetStringCodec::serialize_into(*field_id, *document_id, value, &mut bytes); + FieldDocIdFacetStringCodec::serialize_into( + *field_id, + *document_id, + normalized_value, + &mut bytes, + ); Some(Cow::Owned(bytes)) } } diff --git a/milli/src/index.rs b/milli/src/index.rs index b2be10767..efc31ab46 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -96,7 +96,7 @@ pub struct Index { /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, /// Maps the document id, the facet field id and the strings. - pub field_id_docid_facet_strings: Database, + pub field_id_docid_facet_strings: Database, /// Maps the document id to the document as an obkv store. pub documents: Database, ObkvCodec>, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index 91620da2a..d81f20732 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,6 +1,6 @@ use std::mem::size_of; -use heed::types::ByteSlice; +use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; use super::{Distinct, DocIter}; @@ -127,7 +127,7 @@ fn facet_number_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index @@ -144,14 +144,14 @@ fn facet_string_values<'a>( distinct: FieldId, index: &Index, txn: &'a heed::RoTxn, -) -> Result> { +) -> Result> { let key = facet_values_prefix_key(distinct, id); let iter = index .field_id_docid_facet_strings .remap_key_type::() .prefix_iter(txn, &key)? - .remap_key_type::(); + .remap_types::(); Ok(iter) } diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index fef4ecc87..7c9acf276 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -2,15 +2,16 @@ use std::collections::{BTreeMap, HashSet}; use std::ops::Bound::Unbounded; use std::{cmp, fmt, mem}; -use heed::types::{ByteSlice, Unit}; -use heed::{BytesDecode, Database}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, UserError}; use crate::facet::FacetType; -use crate::heed_codec::facet::FacetStringLevelZeroCodec; +use crate::heed_codec::facet::{ + FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, +}; use crate::search::facet::{FacetNumberIter, FacetNumberRange, FacetStringIter}; -use crate::{DocumentId, FieldId, Index, Result}; +use crate::{FieldId, Index, Result}; /// The default number of values by facets that will /// be fetched from the key-value store. @@ -67,46 +68,55 @@ impl<'a> FacetDistribution<'a> { candidates: &RoaringBitmap, distribution: &mut BTreeMap, ) -> heed::Result<()> { - fn fetch_facet_values<'t, KC, K: 't>( - rtxn: &'t heed::RoTxn, - db: Database, - field_id: FieldId, - candidates: &RoaringBitmap, - distribution: &mut BTreeMap, - ) -> heed::Result<()> - where - K: fmt::Display, - KC: BytesDecode<'t, DItem = (FieldId, DocumentId, K)>, - { - let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); - - for docid in candidates.into_iter() { - key_buffer.truncate(mem::size_of::()); - key_buffer.extend_from_slice(&docid.to_be_bytes()); - let iter = db - .remap_key_type::() - .prefix_iter(rtxn, &key_buffer)? - .remap_key_type::(); - - for result in iter { - let ((_, _, value), ()) = result?; - *distribution.entry(value.to_string()).or_insert(0) += 1; - } - } - - Ok(()) - } - match facet_type { FacetType::Number => { + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_f64s; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, value), ()) = result?; + *distribution.entry(value.to_string()).or_insert(0) += 1; + } + } } FacetType::String => { + let mut normalized_distribution = BTreeMap::new(); + let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let db = self.index.field_id_docid_facet_strings; - fetch_facet_values(self.rtxn, db, field_id, candidates, distribution) + for docid in candidates.into_iter() { + key_buffer.truncate(mem::size_of::()); + key_buffer.extend_from_slice(&docid.to_be_bytes()); + let iter = db + .remap_key_type::() + .prefix_iter(self.rtxn, &key_buffer)? + .remap_key_type::(); + + for result in iter { + let ((_, _, normalized_value), original_value) = result?; + let (_, count) = normalized_distribution + .entry(normalized_value) + .or_insert_with(|| (original_value, 0)); + *count += 1; + } + } + + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); } } + + Ok(()) } /// There is too much documents, we use the facet levels to move throught @@ -227,7 +237,6 @@ impl<'a> FacetDistribution<'a> { &mut distribution, )?; } - Ok(distribution) } None => self.facet_values_from_raw_facet_database(field_id), diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 222f3b2d3..e9c1e507a 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -3,7 +3,7 @@ use std::collections::HashMap; use chrono::Utc; use fst::IntoStreamer; -use heed::types::{ByteSlice, Unit}; +use heed::types::ByteSlice; use roaring::RoaringBitmap; use serde_json::Value; @@ -419,15 +419,16 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { } } -fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F>( +fn remove_docids_from_field_id_docid_facet_value<'a, C, K, F, DC, V>( wtxn: &'a mut heed::RwTxn, - db: &heed::Database, + db: &heed::Database, field_id: FieldId, to_remove: &RoaringBitmap, convert: F, ) -> heed::Result<()> where - C: heed::BytesDecode<'a, DItem = K> + heed::BytesEncode<'a, EItem = K>, + C: heed::BytesDecode<'a, DItem = K>, + DC: heed::BytesDecode<'a, DItem = V>, F: Fn(K) -> DocumentId, { let mut iter = db @@ -436,7 +437,7 @@ where .remap_key_type::(); while let Some(result) = iter.next() { - let (key, ()) = result?; + let (key, _) = result?; if to_remove.contains(convert(key)) { // safety: we don't keep references from inside the LMDB database. unsafe { iter.del_current()? }; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 4c1071aab..1538295f9 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -65,7 +65,7 @@ pub struct Store<'s, A> { LinkedHashMap<(SmallVec32, SmallVec32, u8), RoaringBitmap>, words_pairs_proximities_docids_limit: usize, facet_field_number_docids: LinkedHashMap<(FieldId, OrderedFloat), RoaringBitmap>, - facet_field_string_docids: LinkedHashMap<(FieldId, String), RoaringBitmap>, + facet_field_string_docids: LinkedHashMap<(FieldId, String), (String, RoaringBitmap)>, facet_field_value_docids_limit: usize, // MTBL parameters chunk_compression_type: CompressionType, @@ -283,25 +283,33 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn insert_facet_string_values_docid( &mut self, field_id: FieldId, - value: String, + normalized_value: String, + original_value: String, id: DocumentId, ) -> Result<()> { - if value.is_empty() { + if normalized_value.is_empty() { return Ok(()); } let sorter = &mut self.field_id_docid_facet_strings_sorter; - Self::write_field_id_docid_facet_string_value(sorter, field_id, id, &value)?; + Self::write_field_id_docid_facet_string_value( + sorter, + field_id, + id, + &normalized_value, + &original_value, + )?; - let key = (field_id, value); + let key = (field_id, normalized_value); // if get_refresh finds the element it is assured to be at the end of the linked hash map. match self.facet_field_string_docids.get_refresh(&key) { - Some(old) => { + Some((_original_value, old)) => { old.insert(id); } None => { // A newly inserted element is append at the end of the linked hash map. - self.facet_field_string_docids.insert(key, RoaringBitmap::from_iter(Some(id))); + self.facet_field_string_docids + .insert(key, (original_value, RoaringBitmap::from_iter(Some(id)))); // If the word docids just reached it's capacity we must make sure to remove // one element, this way next time we insert we doesn't grow the capacity. if self.facet_field_string_docids.len() == self.facet_field_value_docids_limit { @@ -363,7 +371,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { document_id: DocumentId, words_positions: &mut HashMap>, facet_numbers_values: &mut HashMap>, - facet_strings_values: &mut HashMap>, + facet_strings_values: &mut HashMap>, record: &[u8], ) -> Result<()> { // We compute the list of words pairs proximities (self-join) and write it directly to disk. @@ -399,8 +407,8 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { // We store document_id associated with all the facet strings fields ids and values. for (field, values) in facet_strings_values.drain() { - for value in values { - self.insert_facet_string_values_docid(field, value, document_id)?; + for (normalized, original) in values { + self.insert_facet_string_values_docid(field, normalized, original, document_id)?; } } @@ -516,23 +524,23 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { fn write_facet_field_string_docids(sorter: &mut Sorter>, iter: I) -> Result<()> where - I: IntoIterator, + I: IntoIterator, Error: From, { let mut key_buffer = Vec::new(); let mut data_buffer = Vec::new(); - for ((field_id, value), docids) in iter { + for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); data_buffer.clear(); - FacetStringLevelZeroCodec::serialize_into(field_id, &value, &mut key_buffer); + FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); if lmdb_key_valid_size(&key_buffer) { sorter.insert(&key_buffer, &data_buffer)?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } } @@ -587,19 +595,24 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { sorter: &mut Sorter>, field_id: FieldId, document_id: DocumentId, - value: &str, + normalized_value: &str, + original_value: &str, ) -> Result<()> where Error: From, { let mut buffer = Vec::new(); - - FieldDocIdFacetStringCodec::serialize_into(field_id, document_id, value, &mut buffer); + FieldDocIdFacetStringCodec::serialize_into( + field_id, + document_id, + normalized_value, + &mut buffer, + ); if lmdb_key_valid_size(&buffer) { - sorter.insert(&buffer, &[])?; + sorter.insert(&buffer, original_value.as_bytes())?; } else { - warn!("facet value {:?} is too large to be saved", value); + warn!("facet value {:?} is too large to be saved", original_value); } Ok(()) @@ -929,24 +942,24 @@ fn process_tokens<'a>( .filter(|(_, t)| t.is_word()) } -fn extract_facet_values(value: &Value) -> (Vec, Vec) { +fn extract_facet_values(value: &Value) -> (Vec, Vec<(String, String)>) { fn inner_extract_facet_values( value: &Value, can_recurse: bool, output_numbers: &mut Vec, - output_strings: &mut Vec, + output_strings: &mut Vec<(String, String)>, ) { match value { Value::Null => (), - Value::Bool(b) => output_strings.push(b.to_string()), + Value::Bool(b) => output_strings.push((b.to_string(), b.to_string())), Value::Number(number) => { if let Some(float) = number.as_f64() { output_numbers.push(float); } } - Value::String(string) => { - let string = string.trim().to_lowercase(); - output_strings.push(string); + Value::String(original) => { + let normalized = original.trim().to_lowercase(); + output_strings.push((normalized, original.clone())); } Value::Array(values) => { if can_recurse { diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index c6540b33a..e4adbccb9 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -276,8 +276,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { match self.searchable_fields { Setting::Set(ref fields) => { // every time the searchable attributes are updated, we need to update the - // ids for any settings that uses the facets. (displayed_fields, - // filterable_fields) + // ids for any settings that uses the facets. (distinct_fields, filterable_fields). let old_fields_ids_map = self.index.fields_ids_map(self.wtxn)?; let mut new_fields_ids_map = FieldsIdsMap::new(); From 0227254a65f9627099a74039932236522cff278e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 17 Jul 2021 12:50:01 +0200 Subject: [PATCH 12/12] Return the original string values for the inverted facet index database --- infos/src/main.rs | 4 +- .../facet_string_level_zero_value_codec.rs | 80 +++++++++++++++++++ .../facet_string_zero_bounds_value_codec.rs | 14 +--- milli/src/heed_codec/facet/mod.rs | 12 +++ milli/src/index.rs | 9 ++- milli/src/search/distinct/facet_distinct.rs | 11 +-- milli/src/search/distinct/mod.rs | 3 +- milli/src/search/facet/facet_distribution.rs | 24 ++++-- milli/src/search/facet/facet_string.rs | 26 ++++-- milli/src/search/facet/filter_condition.rs | 12 ++- milli/src/update/delete_documents.rs | 33 +++++++- milli/src/update/facets.rs | 30 +++++-- .../update/index_documents/merge_function.rs | 23 ++++++ milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/store.rs | 16 ++-- 15 files changed, 242 insertions(+), 58 deletions(-) create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs diff --git a/infos/src/main.rs b/infos/src/main.rs index d5d1ad0af..da15251b0 100644 --- a/infos/src/main.rs +++ b/infos/src/main.rs @@ -627,14 +627,14 @@ fn facet_values_docids( FacetType::String => { wtr.write_record(&["facet_string", "documents_count", "documents_ids"])?; for result in facet_values_iter(rtxn, index.facet_id_string_docids, field_id)? { - let ((_fid, value), docids) = result?; + let ((_fid, normalized), (_original, docids)) = result?; let count = docids.len(); let docids = if debug { format!("{:?}", docids) } else { format!("{:?}", docids.iter().collect::>()) }; - wtr.write_record(&[value.to_string(), count.to_string(), docids])?; + wtr.write_record(&[normalized.to_string(), count.to_string(), docids])?; } } } diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs new file mode 100644 index 000000000..b2434d453 --- /dev/null +++ b/milli/src/heed_codec/facet/facet_string_level_zero_value_codec.rs @@ -0,0 +1,80 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::{marker, str}; + +use super::try_split_at; + +/// A codec that encodes a string in front of the value. +/// +/// The usecase is for the facet string levels algorithm where we must know the +/// original string of a normalized facet value, the original values are stored +/// in the value to not break the lexicographical ordering of the LMDB keys. +pub struct FacetStringLevelZeroValueCodec(marker::PhantomData); + +impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesDecode<'a>, +{ + type DItem = (&'a str, C::DItem); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (string_len, bytes) = try_split_at(bytes, 2)?; + let string_len = string_len.try_into().ok().map(u16::from_be_bytes)?; + + let (string, bytes) = try_split_at(bytes, string_len as usize)?; + let string = str::from_utf8(string).ok()?; + + C::bytes_decode(bytes).map(|item| (string, item)) + } +} + +impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec +where + C: heed::BytesEncode<'a>, +{ + type EItem = (&'a str, C::EItem); + + fn bytes_encode((string, value): &'a Self::EItem) -> Option> { + let string_len: u16 = string.len().try_into().ok()?; + let value_bytes = C::bytes_encode(&value)?; + + let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len()); + bytes.extend_from_slice(&string_len.to_be_bytes()); + bytes.extend_from_slice(string.as_bytes()); + bytes.extend_from_slice(&value_bytes[..]); + + Some(Cow::Owned(bytes)) + } +} + +#[cfg(test)] +mod tests { + use heed::types::Unit; + use heed::{BytesDecode, BytesEncode}; + use roaring::RoaringBitmap; + + use super::*; + use crate::CboRoaringBitmapCodec; + + #[test] + fn deserialize_roaring_bitmaps() { + let string = "abc"; + let docids: RoaringBitmap = (0..100).chain(3500..4398).collect(); + let key = (string, docids.clone()); + let bytes = + FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_docids) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_docids), (string, docids)); + } + + #[test] + fn deserialize_unit() { + let string = "def"; + let key = (string, ()); + let bytes = FacetStringLevelZeroValueCodec::::bytes_encode(&key).unwrap(); + let (out_string, out_unit) = + FacetStringLevelZeroValueCodec::::bytes_decode(&bytes).unwrap(); + assert_eq!((out_string, out_unit), (string, ())); + } +} diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs index 6161118b6..337433c2b 100644 --- a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs +++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs @@ -2,7 +2,9 @@ use std::borrow::Cow; use std::convert::TryInto; use std::{marker, str}; -/// A codec that encodes two strings in front of the value. +use super::try_split_at; + +/// A codec that optionally encodes two strings in front of the value. /// /// The usecase is for the facet string levels algorithm where we must /// know the origin of a group, the group left and right bounds are stored @@ -79,16 +81,6 @@ where } } -/// Tries to split a slice in half at the given middle point, -/// `None` if the slice is too short. -fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if slice.len() >= mid { - Some(slice.split_at(mid)) - } else { - None - } -} - #[cfg(test)] mod tests { use heed::types::Unit; diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs index ecab7eb7c..a6a805bf7 100644 --- a/milli/src/heed_codec/facet/mod.rs +++ b/milli/src/heed_codec/facet/mod.rs @@ -1,6 +1,7 @@ mod facet_level_value_f64_codec; mod facet_level_value_u32_codec; mod facet_string_level_zero_codec; +mod facet_string_level_zero_value_codec; mod facet_string_zero_bounds_value_codec; mod field_doc_id_facet_f64_codec; mod field_doc_id_facet_string_codec; @@ -8,6 +9,17 @@ mod field_doc_id_facet_string_codec; pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec; pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec; pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec; +pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec; pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec; pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec; pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec; + +/// Tries to split a slice in half at the given middle point, +/// `None` if the slice is too short. +pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { + if slice.len() >= mid { + Some(slice.split_at(mid)) + } else { + None + } +} diff --git a/milli/src/index.rs b/milli/src/index.rs index efc31ab46..f26643de7 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -11,8 +11,8 @@ use roaring::RoaringBitmap; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; use crate::fields_ids_map::FieldsIdsMap; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::{ default_criteria, BEU32StrCodec, BoRoaringBitmapCodec, CboRoaringBitmapCodec, Criterion, @@ -90,8 +90,9 @@ pub struct Index { /// Maps the facet field id, level and the number with the docids that corresponds to it. pub facet_id_f64_docids: Database, - /// Maps the facet field id and the string with the docids that corresponds to it. - pub facet_id_string_docids: Database, + /// Maps the facet field id and the string with the original string and docids that corresponds to it. + pub facet_id_string_docids: + Database>, /// Maps the document id, the facet field id and the numbers. pub field_id_docid_facet_f64s: Database, diff --git a/milli/src/search/distinct/facet_distinct.rs b/milli/src/search/distinct/facet_distinct.rs index d81f20732..4436d4cda 100644 --- a/milli/src/search/distinct/facet_distinct.rs +++ b/milli/src/search/distinct/facet_distinct.rs @@ -1,5 +1,6 @@ use std::mem::size_of; +use concat_arrays::concat_arrays; use heed::types::{ByteSlice, Str, Unit}; use roaring::RoaringBitmap; @@ -43,7 +44,10 @@ pub struct FacetDistinctIter<'a> { impl<'a> FacetDistinctIter<'a> { fn facet_string_docids(&self, key: &str) -> heed::Result> { - self.index.facet_id_string_docids.get(self.txn, &(self.distinct, key)) + self.index + .facet_id_string_docids + .get(self.txn, &(self.distinct, key)) + .map(|result| result.map(|(_original, docids)| docids)) } fn facet_number_docids(&self, key: f64) -> heed::Result> { @@ -116,10 +120,7 @@ impl<'a> FacetDistinctIter<'a> { } fn facet_values_prefix_key(distinct: FieldId, id: DocumentId) -> [u8; FID_SIZE + DOCID_SIZE] { - let mut key = [0; FID_SIZE + DOCID_SIZE]; - key[0..FID_SIZE].copy_from_slice(&distinct.to_be_bytes()); - key[FID_SIZE..].copy_from_slice(&id.to_be_bytes()); - key + concat_arrays!(distinct.to_be_bytes(), id.to_be_bytes()) } fn facet_number_values<'a>( diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index ae3fdb91e..e7dc52a82 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -47,7 +47,7 @@ mod test { let mut documents = Vec::new(); - let txts = ["toto", "titi", "tata"]; + let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -90,7 +90,6 @@ mod test { addition.index_documents_method(IndexDocumentsMethod::ReplaceDocuments); addition.update_format(UpdateFormat::Json); - addition.execute(JSON.to_string().as_bytes(), |_, _| ()).unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/facet_distribution.rs b/milli/src/search/facet/facet_distribution.rs index 7c9acf276..94f875dfc 100644 --- a/milli/src/search/facet/facet_distribution.rs +++ b/milli/src/search/facet/facet_distribution.rs @@ -23,7 +23,7 @@ const MAX_VALUES_BY_FACET: usize = 1000; /// Threshold on the number of candidates that will make /// the system to choose between one algorithm or another. -const CANDIDATES_THRESHOLD: u64 = 35_000; +const CANDIDATES_THRESHOLD: u64 = 3000; pub struct FacetDistribution<'a> { facets: Option>, @@ -72,6 +72,7 @@ impl<'a> FacetDistribution<'a> { FacetType::Number => { let mut key_buffer: Vec<_> = field_id.to_be_bytes().iter().copied().collect(); + let distribution_prelength = distribution.len(); let db = self.index.field_id_docid_facet_f64s; for docid in candidates.into_iter() { key_buffer.truncate(mem::size_of::()); @@ -84,6 +85,9 @@ impl<'a> FacetDistribution<'a> { for result in iter { let ((_, _, value), ()) = result?; *distribution.entry(value.to_string()).or_insert(0) += 1; + if distribution.len() - distribution_prelength == self.max_values_by_facet { + break; + } } } } @@ -106,6 +110,10 @@ impl<'a> FacetDistribution<'a> { .entry(normalized_value) .or_insert_with(|| (original_value, 0)); *count += 1; + + if normalized_distribution.len() == self.max_values_by_facet { + break; + } } } @@ -154,10 +162,10 @@ impl<'a> FacetDistribution<'a> { FacetStringIter::new_non_reducing(self.rtxn, self.index, field_id, candidates.clone())?; for result in iter { - let (value, mut docids) = result?; + let (_normalized, original, mut docids) = result?; docids &= candidates; if !docids.is_empty() { - distribution.insert(value.to_string(), docids.len()); + distribution.insert(original.to_string(), docids.len()); } if distribution.len() == self.max_values_by_facet { break; @@ -193,14 +201,20 @@ impl<'a> FacetDistribution<'a> { .prefix_iter(self.rtxn, &field_id.to_be_bytes())? .remap_key_type::(); + let mut normalized_distribution = BTreeMap::new(); for result in iter { - let ((_, value), docids) = result?; - distribution.insert(value.to_string(), docids.len()); + let ((_, normalized_value), (original_value, docids)) = result?; + normalized_distribution.insert(normalized_value, (original_value, docids.len())); if distribution.len() == self.max_values_by_facet { break; } } + let iter = normalized_distribution + .into_iter() + .map(|(_normalized, (original, count))| (original.to_string(), count)); + distribution.extend(iter); + Ok(distribution) } diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs index f0b527104..40ea8c04a 100644 --- a/milli/src/search/facet/facet_string.rs +++ b/milli/src/search/facet/facet_string.rs @@ -135,7 +135,8 @@ use heed::{Database, LazyDecode, RoRange}; use roaring::RoaringBitmap; use crate::heed_codec::facet::{ - FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec, + FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::{FieldId, Index}; @@ -209,7 +210,11 @@ impl<'t> Iterator for FacetStringGroupRange<'t> { /// /// It yields the facet string and the roaring bitmap associated with it. pub struct FacetStringLevelZeroRange<'t> { - iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>, + iter: RoRange< + 't, + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, } impl<'t> FacetStringLevelZeroRange<'t> { @@ -252,18 +257,23 @@ impl<'t> FacetStringLevelZeroRange<'t> { let iter = db .remap_key_type::() .range(rtxn, &(left_bound, right_bound))? - .remap_types::(); + .remap_types::< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec + >(); Ok(FacetStringLevelZeroRange { iter }) } } impl<'t> Iterator for FacetStringLevelZeroRange<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { match self.iter.next() { - Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))), + Some(Ok(((_fid, normalized), (original, docids)))) => { + Some(Ok((normalized, original, docids))) + } Some(Err(e)) => Some(Err(e)), None => None, } @@ -326,7 +336,7 @@ impl<'t> FacetStringIter<'t> { } impl<'t> Iterator for FacetStringIter<'t> { - type Item = heed::Result<(&'t str, RoaringBitmap)>; + type Item = heed::Result<(&'t str, &'t str, RoaringBitmap)>; fn next(&mut self) -> Option { 'outer: loop { @@ -377,11 +387,11 @@ impl<'t> Iterator for FacetStringIter<'t> { // level zero only for result in last { match result { - Ok((value, mut docids)) => { + Ok((normalized, original, mut docids)) => { docids &= &*documents_ids; if !docids.is_empty() { *documents_ids -= &docids; - return Some(Ok((value, docids))); + return Some(Ok((normalized, original, docids))); } } Err(e) => return Some(Err(e)), diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index c5ecb5a79..cc108f855 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -17,7 +17,9 @@ use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; use crate::error::UserError; -use crate::heed_codec::facet::{FacetLevelValueF64Codec, FacetStringLevelZeroCodec}; +use crate::heed_codec::facet::{ + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, +}; use crate::{CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result}; #[derive(Debug, Clone, PartialEq)] @@ -363,7 +365,10 @@ impl FilterCondition { rtxn: &heed::RoTxn, index: &Index, numbers_db: heed::Database, - strings_db: heed::Database, + strings_db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, field_id: FieldId, operator: &Operator, ) -> Result { @@ -374,7 +379,8 @@ impl FilterCondition { GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), Equal(number, string) => { - let string_docids = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); + let (_original_value, string_docids) = + strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { Some(n) => { let n = Included(*n); diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index e9c1e507a..bcb7d7580 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -9,6 +9,7 @@ use serde_json::Value; use super::ClearDocuments; use crate::error::{FieldIdMapMissingEntry, InternalError, UserError}; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::index::{db_name, main_key}; use crate::{DocumentId, ExternalDocumentsIds, FieldId, Index, Result, SmallString32, BEU32}; @@ -374,13 +375,13 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> { drop(iter); // We delete the documents ids that are under the facet field id values. - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_number_docids( self.wtxn, facet_id_f64_docids, &self.documents_ids, )?; - remove_docids_from_facet_field_id_value_docids( + remove_docids_from_facet_field_id_string_docids( self.wtxn, facet_id_string_docids, &self.documents_ids, @@ -447,7 +448,33 @@ where Ok(()) } -fn remove_docids_from_facet_field_id_value_docids<'a, C>( +fn remove_docids_from_facet_field_id_string_docids<'a, C>( + wtxn: &'a mut heed::RwTxn, + db: &heed::Database>, + to_remove: &RoaringBitmap, +) -> heed::Result<()> +where + C: heed::BytesDecode<'a> + heed::BytesEncode<'a>, +{ + let mut iter = db.remap_key_type::().iter_mut(wtxn)?; + while let Some(result) = iter.next() { + let (bytes, (original_value, mut docids)) = result?; + let previous_len = docids.len(); + docids -= to_remove; + if docids.is_empty() { + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.del_current()? }; + } else if docids.len() != previous_len { + let bytes = bytes.to_owned(); + // safety: we don't keep references from inside the LMDB database. + unsafe { iter.put_current(&bytes, &(original_value, docids))? }; + } + } + + Ok(()) +} + +fn remove_docids_from_facet_field_id_number_docids<'a, C>( wtxn: &'a mut heed::RwTxn, db: &heed::Database, to_remove: &RoaringBitmap, diff --git a/milli/src/update/facets.rs b/milli/src/update/facets.rs index d3bba6d6e..cb9a90f7e 100644 --- a/milli/src/update/facets.rs +++ b/milli/src/update/facets.rs @@ -12,7 +12,7 @@ use roaring::RoaringBitmap; use crate::error::InternalError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetLevelValueU32Codec, FacetStringLevelZeroCodec, - FacetStringZeroBoundsValueCodec, + FacetStringLevelZeroValueCodec, FacetStringZeroBoundsValueCodec, }; use crate::heed_codec::CboRoaringBitmapCodec; use crate::update::index_documents::{ @@ -75,7 +75,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { )?; // Compute and store the faceted strings documents ids. - let string_documents_ids = compute_faceted_documents_ids( + let string_documents_ids = compute_faceted_strings_documents_ids( self.wtxn, self.index.facet_id_string_docids.remap_key_type::(), field_id, @@ -96,7 +96,7 @@ impl<'t, 'u, 'i> Facets<'t, 'u, 'i> { clear_field_number_levels(self.wtxn, self.index.facet_id_f64_docids, field_id)?; // Compute and store the faceted numbers documents ids. - let number_documents_ids = compute_faceted_documents_ids( + let number_documents_ids = compute_faceted_numbers_documents_ids( self.wtxn, self.index.facet_id_f64_docids.remap_key_type::(), field_id, @@ -237,13 +237,26 @@ fn write_number_entry( Ok(()) } -fn compute_faceted_documents_ids( +fn compute_faceted_strings_documents_ids( + rtxn: &heed::RoTxn, + db: heed::Database>, + field_id: FieldId, +) -> Result { + let mut documents_ids = RoaringBitmap::new(); + for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { + let (_key, (_original_value, docids)) = result?; + documents_ids |= docids; + } + + Ok(documents_ids) +} + +fn compute_faceted_numbers_documents_ids( rtxn: &heed::RoTxn, db: heed::Database, field_id: FieldId, ) -> Result { let mut documents_ids = RoaringBitmap::new(); - for result in db.prefix_iter(rtxn, &field_id.to_be_bytes())? { let (_key, docids) = result?; documents_ids |= docids; @@ -265,7 +278,10 @@ fn clear_field_string_levels<'t>( fn compute_facet_string_levels<'t>( rtxn: &'t heed::RoTxn, - db: heed::Database, + db: heed::Database< + FacetStringLevelZeroCodec, + FacetStringLevelZeroValueCodec, + >, compression_type: CompressionType, compression_level: Option, shrink_size: Option, @@ -299,7 +315,7 @@ fn compute_facet_string_levels<'t>( // Because we know the size of the level 0 we can use a range iterator that starts // at the first value of the level and goes to the last by simply counting. for (i, result) in db.range(rtxn, &((field_id, "")..))?.take(first_level_size).enumerate() { - let ((_field_id, value), docids) = result?; + let ((_field_id, value), (_original_value, docids)) = result?; if i == 0 { left = (i as u32, value); diff --git a/milli/src/update/index_documents/merge_function.rs b/milli/src/update/index_documents/merge_function.rs index 8613a8824..7e5d0b581 100644 --- a/milli/src/update/index_documents/merge_function.rs +++ b/milli/src/update/index_documents/merge_function.rs @@ -2,8 +2,11 @@ use std::borrow::Cow; use std::result::Result as StdResult; use fst::IntoStreamer; +use heed::{BytesDecode, BytesEncode}; use roaring::RoaringBitmap; +use crate::error::SerializationError; +use crate::heed_codec::facet::FacetStringLevelZeroValueCodec; use crate::heed_codec::CboRoaringBitmapCodec; use crate::Result; @@ -69,6 +72,26 @@ pub fn roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result Ok(vec) } +/// Uses the FacetStringLevelZeroValueCodec to merge the values. +pub fn tuple_string_cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { + let (head, tail) = values.split_first().unwrap(); + let (head_string, mut head_rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&head[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + + for value in tail { + let (_string, rb) = + FacetStringLevelZeroValueCodec::::bytes_decode(&value[..]) + .ok_or(SerializationError::Decoding { db_name: None })?; + head_rb |= rb; + } + + FacetStringLevelZeroValueCodec::::bytes_encode(&(head_string, head_rb)) + .map(|cow| cow.into_owned()) + .ok_or(SerializationError::Encoding { db_name: None }) + .map_err(Into::into) +} + pub fn cbo_roaring_bitmap_merge(_key: &[u8], values: &[Cow<[u8]>]) -> Result> { let (head, tail) = values.split_first().unwrap(); let mut head = CboRoaringBitmapCodec::deserialize_from(&head[..])?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 9ac05fe1a..efe16def7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -20,6 +20,7 @@ use serde::{Deserialize, Serialize}; pub use self::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use self::store::{Readers, Store}; pub use self::transform::{Transform, TransformOutput}; @@ -655,7 +656,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> { self.wtxn, *self.index.facet_id_string_docids.as_polymorph(), facet_field_strings_docids_readers, - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, write_method, )?; diff --git a/milli/src/update/index_documents/store.rs b/milli/src/update/index_documents/store.rs index 1538295f9..444b11e31 100644 --- a/milli/src/update/index_documents/store.rs +++ b/milli/src/update/index_documents/store.rs @@ -22,12 +22,13 @@ use tempfile::tempfile; use super::merge_function::{ cbo_roaring_bitmap_merge, fst_merge, keep_first, roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, }; use super::{create_sorter, create_writer, writer_into_reader, MergeFn}; use crate::error::{Error, InternalError, SerializationError}; use crate::heed_codec::facet::{ - FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FieldDocIdFacetF64Codec, - FieldDocIdFacetStringCodec, + FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, + FieldDocIdFacetF64Codec, FieldDocIdFacetStringCodec, }; use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec}; use crate::update::UpdateIndexingStep; @@ -153,7 +154,7 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { max_memory, ); let facet_field_strings_docids_sorter = create_sorter( - cbo_roaring_bitmap_merge, + tuple_string_cbo_roaring_bitmap_merge, chunk_compression_type, chunk_compression_level, chunk_fusing_shrink_size, @@ -528,17 +529,18 @@ impl<'s, A: AsRef<[u8]>> Store<'s, A> { Error: From, { let mut key_buffer = Vec::new(); - let mut data_buffer = Vec::new(); for ((field_id, normalized_value), (original_value, docids)) in iter { key_buffer.clear(); - data_buffer.clear(); FacetStringLevelZeroCodec::serialize_into(field_id, &normalized_value, &mut key_buffer); - CboRoaringBitmapCodec::serialize_into(&docids, &mut data_buffer); + + let data = (original_value.as_str(), docids); + let data = FacetStringLevelZeroValueCodec::::bytes_encode(&data) + .ok_or(SerializationError::Encoding { db_name: Some("facet-id-string-docids") })?; if lmdb_key_valid_size(&key_buffer) { - sorter.insert(&key_buffer, &data_buffer)?; + sorter.insert(&key_buffer, &data)?; } else { warn!("facet value {:?} is too large to be saved", original_value); }