From a79661c6dc7fc605f0a4a0a87c7af73941ffb447 Mon Sep 17 00:00:00 2001
From: Kerollmops <clement@meilisearch.com>
Date: Wed, 23 Jun 2021 15:53:28 +0200
Subject: [PATCH] Introduce a lot of facet string helper iterators

---
 .../facet/facet_level_value_u32_codec.rs      |  52 +++++++
 .../facet/facet_string_level_zero_codec.rs    |  49 ++++++
 .../facet_string_zero_bounds_value_codec.rs   |  80 ++++++++++
 milli/src/heed_codec/facet/mod.rs             |   6 +
 milli/src/search/facet/facet_string.rs        | 140 +++++++++++++++++-
 5 files changed, 319 insertions(+), 8 deletions(-)
 create mode 100644 milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
 create mode 100644 milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
 create mode 100644 milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs
diff --git a/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
new file mode 100644
index 000000000..6b51b306e
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
@@ -0,0 +1,52 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::num::NonZeroU8;
+
+use crate::FieldId;
+
+/// A codec for the keys made of a field id, a non-zero level and the two ids bounding a group.
+///
+/// It can only be used for the facet string entries of the level 1 or higher, never the level 0.
+pub struct FacetLevelValueU32Codec;
+
+impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec {
+    type DItem = (FieldId, NonZeroU8, u32, u32);
+    /// Decodes a key written by `bytes_encode`; `None` if the level is 0 or the length is wrong.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        let (field_id, bytes) = bytes.split_first()?;
+        let (level, bytes) = bytes.split_first()?;
+        // The bounds are read from the last 8 of the 16 bytes; `get` returns `None` on short keys.
+        let left = bytes.get(8..12)?.try_into().ok().map(u32::from_be_bytes)?;
+        let right = bytes.get(12..)?.try_into().ok().map(u32::from_be_bytes)?;
+        Some((*field_id, NonZeroU8::new(*level)?, left, right))
+    }
+}
+
+impl heed::BytesEncode<'_> for FacetLevelValueU32Codec {
+    type EItem = (FieldId, NonZeroU8, u32, u32);
+
+    /// Encodes the key as `[field id, level, left, right, left, right]`.
+    ///
+    /// Every bound is written as a big-endian `u32` and the pair of bounds is written
+    /// twice, which makes a key of 18 bytes. This function never returns `None`.
+    fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
+        let left = left.to_be_bytes();
+        let right = right.to_be_bytes();
+
+        // Two header bytes followed by four integers of four bytes each.
+        let mut key = Vec::with_capacity(2 + 4 * 4);
+        key.push(*field_id);
+        key.push(level.get());
+
+        // The pair of bounds is written two times in a row. Big-endian integers already
+        // sort lexicographically like their numeric values, so both copies are made of
+        // the very same bytes.
+        for _ in 0..2 {
+            key.extend_from_slice(&left);
+            key.extend_from_slice(&right);
+        }
+
+        // The key is always owned as it is built from scratch.
+        Some(Cow::Owned(key))
+    }
+}
diff --git a/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
new file mode 100644
index 000000000..1c0c4be93
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
@@ -0,0 +1,49 @@
+use std::borrow::Cow;
+use std::str;
+
+use crate::FieldId;
+
+/// A codec that stores the field id, the level 0 and the facet string.
+///
+/// It can only be used to encode the facet strings of the level 0,
+/// as it hardcodes the level.
+///
+/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys,
+/// and to make sure that the levels are not mixed up. The level 0 is special: its keys
+/// are strings, while the other levels represent groups whose keys are simply two integers.
+pub struct FacetStringLevelZeroCodec;
+
+impl FacetStringLevelZeroCodec {
+    /// Appends the field id, the level (always 0) and the bytes of `value` at the end of `out`.
+    pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
+        out.reserve(value.len() + 2);
+        out.extend_from_slice(&[field_id, 0]); // 0 is the level (for LMDB ordering only)
+        out.extend_from_slice(value.as_bytes());
+    }
+}
+
+impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec {
+    type DItem = (FieldId, &'a str);
+
+    /// Reads back the field id and the facet string of a level 0 key.
+    ///
+    /// Returns `None` when the key is shorter than two bytes, when its level byte
+    /// is not 0 or when the bytes following the level are not valid UTF-8.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        match bytes {
+            // A field id byte, the mandatory zero level, then the string itself.
+            [field_id, 0, tail @ ..] => str::from_utf8(tail).ok().map(|value| (*field_id, value)),
+            _ => None,
+        }
+    }
+}
+
+impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec {
+    type EItem = (FieldId, &'a str);
+    /// Builds an owned level 0 key by delegating to `serialize_into`; this never returns `None`.
+    fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
+        let mut key = Vec::new();
+        Self::serialize_into(*field_id, value, &mut key);
+        Some(Cow::from(key))
+    }
+}
diff --git a/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs
new file mode 100644
index 000000000..3c2ce4657
--- /dev/null
+++ b/milli/src/heed_codec/facet/facet_string_zero_bounds_value_codec.rs
@@ -0,0 +1,80 @@
+use std::borrow::Cow;
+use std::convert::TryInto;
+use std::{marker, str};
+
+/// A codec that encodes two optional strings in front of a value encoded by the codec `C`.
+///
+/// The use case is the facet string levels algorithm, where we must know the origin
+/// of a group: the left and right bounds of the group are stored in the value
+/// to not break the lexicographical ordering of the LMDB keys.
+pub struct FacetStringZeroBoundsValueCodec<C>(marker::PhantomData<C>);
+
+impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec<C>
+where
+    C: heed::BytesDecode<'a>,
+{
+    type DItem = (Option<(&'a str, &'a str)>, C::DItem);
+
+    /// Decodes the optional bounds, then hands the remaining bytes over to `C`.
+    ///
+    /// Returns `None` if the bytes are truncated, if a bound is not valid UTF-8
+    /// or if `C` cannot decode the value.
+    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
+        // The first byte is a flag: non-zero means that the bounds are present.
+        let (contains_bounds, bytes) = bytes.split_first()?;
+        if *contains_bounds == 0 {
+            return C::bytes_decode(bytes).map(|item| (None, item));
+        }
+
+        // The two lengths come right after the flag, the flag itself must be skipped.
+        let (left_len, bytes) = try_split_at(bytes, 2)?;
+        let (right_len, bytes) = try_split_at(bytes, 2)?;
+        let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?;
+        let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?;
+        let (left, bytes) = try_split_at(bytes, left_len as usize)?;
+        let (right, bytes) = try_split_at(bytes, right_len as usize)?;
+        let bounds = (str::from_utf8(left).ok()?, str::from_utf8(right).ok()?);
+        C::bytes_decode(bytes).map(|item| (Some(bounds), item))
+    }
+}
+
+impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec<C>
+where
+    C: heed::BytesEncode<'a>,
+{
+    type EItem = (Option<(&'a str, &'a str)>, C::EItem);
+
+    /// Encodes the optional bounds followed by the bytes `C` produces for the value.
+    ///
+    /// Layout: a flag byte (zero without bounds, non-zero with bounds) then, only with bounds,
+    /// the two lengths as big-endian `u16` and the two strings, and finally the value.
+    /// Returns `None` if a bound is longer than `u16::MAX` bytes or if `C` fails to encode.
+    fn bytes_encode((bounds, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
+        let mut bytes = Vec::new();
+        match bounds {
+            Some((left, right)) => {
+                let left_len: u16 = left.len().try_into().ok()?;
+                let right_len: u16 = right.len().try_into().ok()?;
+                // Any non-zero flag tells the decoder that the bounds follow.
+                bytes.push(u8::MAX);
+                bytes.extend_from_slice(&left_len.to_be_bytes());
+                bytes.extend_from_slice(&right_len.to_be_bytes());
+                bytes.extend_from_slice(left.as_bytes());
+                bytes.extend_from_slice(right.as_bytes());
+            }
+            None => bytes.push(0),
+        }
+        bytes.extend_from_slice(&C::bytes_encode(value)?);
+        Some(Cow::Owned(bytes))
+    }
+}
+
+/// Splits `slice` in two parts at the index `mid`: `[0, mid)` and `[mid, len)`.
+///
+/// Returns `None`, instead of panicking like `split_at` does, when `slice` holds
+/// fewer than `mid` bytes.
+fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
+    let head = slice.get(..mid)?;
+    let tail = slice.get(mid..)?;
+    Some((head, tail))
+}
diff --git a/milli/src/heed_codec/facet/mod.rs b/milli/src/heed_codec/facet/mod.rs
index 532da12fa..90dc79134 100644
--- a/milli/src/heed_codec/facet/mod.rs
+++ b/milli/src/heed_codec/facet/mod.rs
@@ -1,9 +1,15 @@
 mod facet_level_value_f64_codec;
+mod facet_level_value_u32_codec;
+mod facet_string_level_zero_codec;
+mod facet_string_zero_bounds_value_codec;
 mod facet_value_string_codec;
 mod field_doc_id_facet_f64_codec;
 mod field_doc_id_facet_string_codec;
 
 pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
+pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
+pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
+pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
 pub use self::facet_value_string_codec::FacetValueStringCodec;
 pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
 pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
diff --git a/milli/src/search/facet/facet_string.rs b/milli/src/search/facet/facet_string.rs
index 61fc32f8e..d4d85153f 100644
--- a/milli/src/search/facet/facet_string.rs
+++ b/milli/src/search/facet/facet_string.rs
@@ -31,7 +31,7 @@
 //!
 //! ### Example of what a facet number LMDB database contain
 //!
-//! | level | left-bound | right-bound | docs             |
+//! | level | left-bound | right-bound | documents ids    |
 //! |-------|------------|-------------|------------------|
 //! | 0     | 0          | _skipped_   | 1, 2             |
 //! | 0     | 1          | _skipped_   | 6, 7             |
@@ -48,7 +48,7 @@
 //! The next levels have two different bounds and the associated documents ids are simply the result
 //! of an union of all the documents ids associated with the aggregated groups above.
 //!
-//! ## The complexity of defining groups of facet strings
+//! ## The complexity of defining groups for facet strings
 //!
 //! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
 //! lexicographical order, it means that whatever the key represent the bytes are read in their raw
@@ -77,22 +77,25 @@
 //!
 //! #### Example of facet strings with numbered groups
 //!
-//! | level | left-bound | right-bound | left-string | right-string | docs             |
+//! | level | left-bound | right-bound | left-string | right-string | documents ids    |
 //! |-------|------------|-------------|-------------|--------------|------------------|
 //! | 0     | alpha      | _skipped_   | _skipped_   | _skipped_    | 1, 2             |
 //! | 0     | beta       | _skipped_   | _skipped_   | _skipped_    | 6, 7             |
 //! | 0     | gamma      | _skipped_   | _skipped_   | _skipped_    | 4, 7             |
 //! | 0     | omega      | _skipped_   | _skipped_   | _skipped_    | 2, 3, 4          |
 //! | 1     | 0          | 1           | alpha       | beta         | 1, 2, 6, 7       |
-//! | 1     | 3          | 5           | gamma       | omega        | 2, 3, 4, 7       |
-//! | 2     | 0          | 5           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
+//! | 1     | 2          | 3           | gamma       | omega        | 2, 3, 4, 7       |
+//! | 2     | 0          | 3           | _skipped_   | _skipped_    | 1, 2, 3, 4, 6, 7 |
 //!
 //! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
 //! need to store the facet string value two times.
 //!
-//! In the value, not in the key, you can see that we added two new values:
-//! the left-string and the right-string, which defines the original facet strings associated with
-//! the given group.
+//! The numbers in the left-bound and right-bound columns are incremental numbers representing the
+//! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
+//! of the LMDB keys.
+//!
+//! In the value, not in the key, you can see that we added two new values: the left-string and the
+//! right-string, which define the original facet strings associated with the given group.
 //!
 //! We put those two strings inside of the value, this way we do not limit the maximum size of the
 //! facet string values, and the impact on performances is not important as, IIRC, LMDB put big
@@ -121,3 +124,124 @@
 //! If the group doesn't contain one of our documents ids, we continue to the next group at this
 //! same level.
 //!
+
+use std::num::NonZeroU8;
+use std::ops::Bound;
+use std::ops::Bound::{Excluded, Included, Unbounded};
+
+use heed::types::{ByteSlice, Str};
+use heed::{Database, LazyDecode, RoRange};
+use roaring::RoaringBitmap;
+
+use crate::heed_codec::facet::{
+    FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec,
+};
+use crate::heed_codec::CboRoaringBitmapCodec;
+use crate::FieldId;
+
+/// An iterator that is used to explore the groups of the facet strings levels,
+/// from the level 1 and above.
+///
+/// It yields the level and the two group ids that an entry covers, the optional level 0
+/// strings bounding the group (only stored for the level 1) and the associated roaring bitmap.
+pub struct FacetStringGroupRange<'t> {
+    iter: RoRange<
+        't,
+        FacetLevelValueU32Codec,
+        LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
+    >,
+    /// The right bound, checked against the right id of every entry while iterating.
+    end: Bound<u32>,
+}
+
+impl<'t> FacetStringGroupRange<'t> {
+    /// Ranges over the groups of `field_id` at `level` from `left`; `right` is checked by `next`.
+    pub fn new(
+        rtxn: &'t heed::RoTxn,
+        db: Database<
+            FacetLevelValueU32Codec,
+            FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
+        >,
+        field_id: FieldId,
+        level: NonZeroU8,
+        left: Bound<u32>,
+        right: Bound<u32>,
+    ) -> heed::Result<FacetStringGroupRange<'t>> {
+        let start = match left {
+            Included(left) => Included((field_id, level, left, u32::MIN)),
+            Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
+            Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
+        };
+        let bounds = (start, Included((field_id, level, u32::MAX, u32::MAX)));
+        db.lazily_decode_data().range(rtxn, &bounds).map(|iter| Self { iter, end: right })
+    }
+}
+
+impl<'t> Iterator for FacetStringGroupRange<'t> {
+    type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
+
+    /// Returns the next group, as long as its right id does not go past the `end` bound.
+    ///
+    /// The iteration stops (`None`) at the first group that crosses the bound. Errors
+    /// raised by the cursor or while decoding the value are yielded as `Some(Err(_))`.
+    fn next(&mut self) -> Option<Self::Item> {
+        let ((_fid, level, left, right), lazy_value) = match self.iter.next()? {
+            Ok(entry) => entry,
+            Err(e) => return Some(Err(e)),
+        };
+
+        let must_be_returned = match self.end {
+            Included(end) => right <= end,
+            Excluded(end) => right < end,
+            Unbounded => true,
+        };
+        if !must_be_returned {
+            return None;
+        }
+        // The value is only decoded once we know that the entry is going to be yielded.
+        Some(lazy_value.decode().map(|value| ((level, left, right), value)))
+    }
+}
+
+/// An iterator that is used to explore the level 0 of the facet strings database.
+///
+/// It yields each facet string of the range along with its roaring bitmap of documents ids.
+pub struct FacetStringLevelZeroRange<'t> {
+    iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+}
+
+impl<'t> FacetStringLevelZeroRange<'t> {
+    /// Ranges over the level 0 strings of `field_id` lying between `left` and `right`.
+    pub fn new(
+        rtxn: &'t heed::RoTxn,
+        db: Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
+        field_id: FieldId,
+        left: Bound<&str>,
+        right: Bound<&str>,
+    ) -> heed::Result<FacetStringLevelZeroRange<'t>> {
+        let left_bound = match left {
+            Included(left) => Included((field_id, left)),
+            Excluded(left) => Excluded((field_id, left)),
+            Unbounded => Included((field_id, "")),
+        };
+        let right_bound = match right {
+            Included(right) => Included((field_id, right)),
+            Excluded(right) => Excluded((field_id, right)),
+            // The last field id has no successor: leave the range open instead of overflowing.
+            Unbounded => field_id.checked_add(1).map_or(Unbounded, |next| Excluded((next, ""))),
+        };
+        db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter })
+    }
+}
+
+impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
+    type Item = heed::Result<(&'t str, RoaringBitmap)>;
+
+    /// Yields the next facet string and its documents ids, the field id of the key is dropped.
+    ///
+    /// The errors raised by the underlying cursor are forwarded as they are.
+    fn next(&mut self) -> Option<Self::Item> {
+        let entry = self.iter.next()?;
+        Some(entry.map(|((_fid, value), docids)| (value, docids)))
+    }
+}