From a79661c6dc7fc605f0a4a0a87c7af73941ffb447 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 23 Jun 2021 15:53:28 +0200 Subject: [PATCH] Introduce a lot of facet string helper iterators --- .../facet/facet_level_value_u32_codec.rs | 52 +++++++ .../facet/facet_string_level_zero_codec.rs | 49 ++++++ .../facet_string_zero_bounds_value_codec.rs | 80 ++++++++++ milli/src/heed_codec/facet/mod.rs | 6 + milli/src/search/facet/facet_string.rs | 140 +++++++++++++++++- 5 files changed, 319 insertions(+), 8 deletions(-) create mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs create mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs new file mode 100644 index 000000000..6b51b306e --- /dev/null +++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs @@ -0,0 +1,52 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::num::NonZeroU8; + +use crate::FieldId; + +/// A codec that stores the field id, level 1 and higher and the groups ids. +/// +/// It can only be used to encode the facet string of the level 1 or higher. 
+pub struct FacetLevelValueU32Codec;
+
+impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec {
+    type DItem = (FieldId, NonZeroU8, u32, u32);
+
+    /// Decodes a `[field_id, level, left, right, left, right]` key, where the bounds
+    /// are big-endian `u32`s, and returns `(field_id, level, left, right)`.
+    ///
+    /// Returns `None` when the level is zero or when the key does not have
+    /// the length produced by `bytes_encode`.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id, bytes) = bytes.split_first()?;
+        let (level, bytes) = bytes.split_first()?;
+        let level = NonZeroU8::new(*level)?;
+        // The bounds are stored twice (16 bytes), the second copy starts at offset 8.
+        // `get` is used instead of indexing to not panic on truncated keys.
+        let left = bytes.get(8..12)?.try_into().ok().map(u32::from_be_bytes)?;
+        let right = bytes.get(12..)?.try_into().ok().map(u32::from_be_bytes)?;
+        Some((*field_id, level, left, right))
+    }
+}
+
+impl heed::BytesEncode<'_> for FacetLevelValueU32Codec {
+    type EItem = (FieldId, NonZeroU8, u32, u32);
+
+    /// Encodes the item as `[field_id, level, left, right, left, right]`,
+    /// every bound being written as a big-endian `u32`.
+    fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
+        let left = left.to_be_bytes();
+        let right = right.to_be_bytes();
+
+        let mut bytes = Vec::with_capacity(2 + 4 * left.len());
+        bytes.push(*field_id);
+        bytes.push(level.get());
+        // Write the big-endian integers.
+        bytes.extend_from_slice(&left);
+        bytes.extend_from_slice(&right);
+        // Then the u32 values just to be able to read them back.
+        bytes.extend_from_slice(&left);
+        bytes.extend_from_slice(&right);
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
new file mode 100644
index 000000000..1c0c4be93
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
@@ -0,0 +1,49 @@
+use std::borrow::Cow;
+use std::str;
+
+use crate::FieldId;
+
+/// A codec that stores the field id, level 0, and facet string.
+///
+/// It can only be used to encode the facet string of the level 0,
+/// as it hardcodes the level.
+///
+/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys,
+/// and make sure that the levels are not mixed-up.
The level 0 is special, the key
+/// are strings, other levels represent groups and keys are simply two integers.
+pub struct FacetStringLevelZeroCodec;
+
+impl FacetStringLevelZeroCodec {
+    /// Appends the key of the given facet string to `out`: the field id,
+    /// the level zero byte and then the bytes of the string.
+    pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
+        out.reserve(value.len() + 2);
+        out.push(field_id);
+        out.push(0); // the level zero (for LMDB ordering only)
+        out.extend_from_slice(value.as_bytes());
+    }
+}
+
+impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec {
+    type DItem = (FieldId, &'a str);
+
+    /// Returns `None` if the key is too short, not a level 0 key or not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        match bytes {
+            // the second byte is the level, only the level zero is accepted
+            [field_id, 0, value @ ..] => Some((*field_id, str::from_utf8(value).ok()?)),
+            _ => None,
+        }
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec {
+    type EItem = (FieldId, &'a str);
+
+    /// Encodes the item with `serialize_into`, it never returns `None`.
+    fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::new();
+        FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes);
+        Some(Cow::Owned(bytes))
+    }
+}
diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs
new file mode 100644
index 000000000..3c2ce4657
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs
@@ -0,0 +1,80 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::{marker, str};
+
+/// A codec that encodes two strings in front of the value.
+///
+/// The usecase is for the facet string levels algorithm where we must
+/// know the origin of a group, the group left and right bounds are stored
+/// in the value to not break the lexicographical ordering of the LMDB keys.
+pub struct FacetStringZeroBoundsValueCodec<C>(marker::PhantomData<C>);
+
+impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec<C>
+where
+    C: heed::BytesDecode<'a>,
+{
+    type DItem = (Option<(&'a str, &'a str)>, C::DItem);
+
+    /// Decodes `[0, value...]` or `[flag, left_len, right_len, left, right, value...]`
+    /// where `flag` is not zero, the lengths are big-endian `u16`s and the value
+    /// is decoded by `C`. Returns `None` if the bytes do not follow this layout.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (contains_bounds, bytes) = bytes.split_first()?;
+        if *contains_bounds == 0 {
+            return C::bytes_decode(bytes).map(|item| (None, item));
+        }
+
+        // The flag byte is consumed, the two lengths are read from what follows it.
+        let (left_len, bytes) = try_split_at(bytes, 2)?;
+        let (right_len, bytes) = try_split_at(bytes, 2)?;
+        let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?;
+        let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?;
+
+        let (left, bytes) = try_split_at(bytes, left_len as usize)?;
+        let (right, bytes) = try_split_at(bytes, right_len as usize)?;
+        let left = str::from_utf8(left).ok()?;
+        let right = str::from_utf8(right).ok()?;
+
+        C::bytes_decode(bytes).map(|item| (Some((left, right)), item))
+    }
+}
+
+impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec<C>
+where
+    C: heed::BytesEncode<'a>,
+{
+    type EItem = (Option<(&'a str, &'a str)>, C::EItem);
+
+    /// Writes the flag byte, the bounds if there are some and then the value encoded by `C`.
+    ///
+    /// Returns `None` if a bound is longer than `u16::MAX` bytes or if `C` fails to encode.
+    fn bytes_encode((bounds, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::new();
+        match bounds {
+            Some((left, right)) => {
+                let left_len: u16 = left.len().try_into().ok()?;
+                let right_len: u16 = right.len().try_into().ok()?;
+                bytes.push(u8::MAX); // any non-zero byte tells that the bounds follow
+                bytes.extend_from_slice(&left_len.to_be_bytes());
+                bytes.extend_from_slice(&right_len.to_be_bytes());
+                // The strings must be stored too, the decoder reads them back.
+                bytes.extend_from_slice(left.as_bytes());
+                bytes.extend_from_slice(right.as_bytes());
+            }
+            None => bytes.push(0),
+        }
+        let value_bytes = C::bytes_encode(value)?;
+        bytes.extend_from_slice(&value_bytes[..]);
+        Some(Cow::Owned(bytes))
+    }
+}
+
+/// Tries to split a slice in half at the given middle point,
+/// `None` if the slice is too short.
+fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
+    // `split_at` panics when `mid` is out of bounds, so the length is checked first.
+    if mid > slice.len() {
+        return None;
+    }
+    Some(slice.split_at(mid))
+}
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
index 532da12fa..90dc79134 100644
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -1,9 +1,15 @@
 mod facet_level_value_f64_codec;
+mod facet_level_value_u32_codec;
+mod facet_string_level_zero_codec;
+mod facet_string_zero_bounds_value_codec;
 mod facet_value_string_codec;
 mod field_doc_id_facet_f64_codec;
 mod field_doc_id_facet_string_codec;
 
 pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
+pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
+pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
+pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
 pub use self::facet_value_string_codec::FacetValueStringCodec;
 pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
 pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs
index 61fc32f8e..d4d85153f 100644
--- a/milli/src/search/facet/facet_string.rs
+++ b/milli/src/search/facet/facet_string.rs
@@ -31,7 +31,7 @@
 //!
 //! ### Example of what a facet number LMDB database contain
 //!
-//! | level | left-bound | right-bound | docs             |
+//! | level | left-bound | right-bound | documents ids    |
 //! |-------|------------|-------------|------------------|
 //! | 0     | 0          | _skipped_   | 1, 2             |
 //! | 0     | 1          | _skipped_   | 6, 7             |
@@ -48,7 +48,7 @@
 //! The next levels have two different bounds and the associated documents ids are simply the result
 //! of an union of all the documents ids associated with the aggregated groups above.
 //!
-//! ## The complexity of defining groups of facet strings
+//! ## The complexity of defining groups for facet strings
 //!
 //! 
As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
 //! lexicographical order, it means that whatever the key represent the bytes are read in their raw
@@ -77,22 +77,25 @@
 //!
 //! #### Example of facet strings with numbered groups
 //!
-//! | level | left-bound | right-bound | left-string | right-string | docs             |
+//! | level | left-bound | right-bound | left-string | right-string | documents ids    |
 //! |-------|------------|-------------|-------------|--------------|------------------|
 //! | 0     | alpha      | _skipped_   | _skipped_   | _skipped_    | 1, 2             |
 //! | 0     | beta       | _skipped_   | _skipped_   | _skipped_    | 6, 7             |
 //! | 0     | gamma      | _skipped_   | _skipped_   | _skipped_    | 4, 7             |
 //! | 0     | omega      | _skipped_   | _skipped_   | _skipped_    | 2, 3, 4          |
 //! | 1     | 0          | 1           | alpha       | beta         | 1, 2, 6, 7       |
-//! | 1     | 3          | 5           | gamma       | omega        | 2, 3, 4, 7       |
-//! | 2     | 0          | 5           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
+//! | 1     | 2          | 3           | gamma       | omega        | 2, 3, 4, 7       |
+//! | 2     | 0          | 3           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
 //!
 //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
 //! need to store the facet string value two times.
 //!
-//! In the value, not in the key, you can see that we added two new values:
-//! the left-string and the right-string, which defines the original facet strings associated with
-//! the given group.
+//! The numbers in the left-bound and right-bound columns are incremental numbers representing the
+//! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
+//! of the LMDB keys.
+//!
+//! In the value, not in the key, you can see that we added two new values: the left-string and the
+//! right-string, which define the original facet strings associated with the given group.
 //!
 //! We put those two strings inside of the value, this way we do not limit the maximum size of the
 //! 
facet string values, and the impact on performances is not important as, IIRC, LMDB put big
@@ -121,3 +124,124 @@
 //! If the group doesn't contain one of our documents ids, we continue to the next group at this
 //! same level.
 //!
+
+use std::num::NonZeroU8;
+use std::ops::Bound::{self, Excluded, Included, Unbounded};
+
+use heed::types::{ByteSlice, Str};
+use heed::{Database, LazyDecode, RoRange};
+use roaring::RoaringBitmap;
+
+use crate::heed_codec::facet::{
+    FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec,
+};
+use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::FieldId;
+
+/// The codec of the values that are read by `FacetStringGroupRange`.
+type GroupValueCodec = FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>;
+
+/// An iterator that is used to explore the facets level strings
+/// from the level 1 to infinity.
+///
+/// It yields the level, group id that an entry covers, the optional group strings
+/// that it covers of the level 0 only if it is an entry from the level 1 and
+/// the roaring bitmap associated.
+pub struct FacetStringGroupRange<'t> {
+    iter: RoRange<'t, FacetLevelValueU32Codec, LazyDecode<GroupValueCodec>>,
+    end: Bound<u32>,
+}
+
+impl<'t> FacetStringGroupRange<'t> {
+    /// Iterates over the groups of `field_id` at `level`, starting at the `left` group id
+    /// and stopping at the first entry whose right group id is past the `right` bound.
+    pub fn new(
+        rtxn: &'t heed::RoTxn,
+        db: Database<FacetLevelValueU32Codec, GroupValueCodec>,
+        field_id: FieldId,
+        level: NonZeroU8,
+        left: Bound<u32>,
+        right: Bound<u32>,
+    ) -> heed::Result<FacetStringGroupRange<'t>> {
+        let left_bound = match left {
+            Included(left) => Included((field_id, level, left, u32::MIN)),
+            Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
+            Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
+        };
+        let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
+        let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
+        Ok(FacetStringGroupRange { iter, end: right })
+    }
+}
+
+impl<'t> Iterator for FacetStringGroupRange<'t> {
+    type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let ((_fid, level, left, right), docids) = match self.iter.next()? {
+            Ok(entry) => entry,
+            Err(e) => return Some(Err(e)),
+        };
+        let must_be_returned = match self.end {
+            Included(end) => right <= end,
+            Excluded(end) => right < end,
+            Unbounded => true,
+        };
+        if must_be_returned {
+            Some(docids.decode().map(|docids| ((level, left, right), docids)))
+        } else {
+            None
+        }
+    }
+}
+
+/// An iterator that is used to explore the level 0 of the facets string database.
+///
+/// It yields the facet string and the roaring bitmap associated with it.
+pub struct FacetStringLevelZeroRange<'t> {
+    iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+}
+
+impl<'t> FacetStringLevelZeroRange<'t> {
+    /// Iterates over the level 0 strings of `field_id` that are between `left` and `right`.
+    pub fn new(
+        rtxn: &'t heed::RoTxn,
+        db: Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+        field_id: FieldId,
+        left: Bound<&str>,
+        right: Bound<&str>,
+    ) -> heed::Result<FacetStringLevelZeroRange<'t>> {
+        // The bounds are raw `[field_id, level, string]` keys: the end of the level 0 is
+        // `[field_id, 1]`, which the level zero codec cannot express.
+        let key = |level: u8, value: &str| [&[field_id, level][..], value.as_bytes()].concat();
+        let left_key = match left {
+            Included(value) | Excluded(value) => key(0, value),
+            Unbounded => key(0, ""),
+        };
+        let right_key = match right {
+            Included(value) | Excluded(value) => key(0, value),
+            Unbounded => key(1, ""), // the first key that follows the level 0
+        };
+        let left_bound = match left {
+            Excluded(_) => Excluded(&left_key[..]),
+            Included(_) | Unbounded => Included(&left_key[..]),
+        };
+        let right_bound = match right {
+            Included(_) => Included(&right_key[..]),
+            Excluded(_) | Unbounded => Excluded(&right_key[..]),
+        };
+        let iter = db
+            .remap_key_type::<ByteSlice>()
+            .range(rtxn, &(left_bound, right_bound))?
+            .remap_key_type::<FacetStringLevelZeroCodec>();
+        Ok(FacetStringLevelZeroRange { iter })
+    }
+}
+
+impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
+    type Item = heed::Result<(&'t str, RoaringBitmap)>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        self.iter.next().map(|res| res.map(|((_fid, value), docids)| (value, docids)))
+    }
+}