mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 23:04:26 +01:00
Introduce a lot of facet string helper iterators
This commit is contained in:
parent
851f979039
commit
a79661c6dc
52
milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
Normal file
52
milli/src/heed_codec/facet/facet_level_value_u32_codec.rs
Normal file
@ -0,0 +1,52 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::num::NonZeroU8;
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
/// A codec that stores the field id, level 1 and higher and the groups ids.
|
||||
///
|
||||
/// It can only be used to encode the facet string of the level 1 or higher.
|
||||
pub struct FacetLevelValueU32Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec {
|
||||
type DItem = (FieldId, NonZeroU8, u32, u32);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
let level = NonZeroU8::new(*level)?;
|
||||
let left = bytes[16..20].try_into().ok().map(u32::from_be_bytes)?;
|
||||
let right = bytes[20..].try_into().ok().map(u32::from_be_bytes)?;
|
||||
Some((*field_id, level, left, right))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for FacetLevelValueU32Codec {
|
||||
type EItem = (FieldId, NonZeroU8, u32, u32);
|
||||
|
||||
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut buffer = [0u8; 16];
|
||||
|
||||
// Write the big-endian integers.
|
||||
let bytes = left.to_be_bytes();
|
||||
buffer[..4].copy_from_slice(&bytes[..]);
|
||||
|
||||
let bytes = right.to_be_bytes();
|
||||
buffer[4..8].copy_from_slice(&bytes[..]);
|
||||
|
||||
// Then the u32 values just to be able to read them back.
|
||||
let bytes = left.to_be_bytes();
|
||||
buffer[8..12].copy_from_slice(&bytes[..]);
|
||||
|
||||
let bytes = right.to_be_bytes();
|
||||
buffer[12..].copy_from_slice(&bytes[..]);
|
||||
|
||||
let mut bytes = Vec::with_capacity(buffer.len() + 2);
|
||||
bytes.push(*field_id);
|
||||
bytes.push(level.get());
|
||||
bytes.extend_from_slice(&buffer);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
49
milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
Normal file
49
milli/src/heed_codec/facet/facet_string_level_zero_codec.rs
Normal file
@ -0,0 +1,49 @@
|
||||
use std::borrow::Cow;
|
||||
use std::str;
|
||||
|
||||
use crate::FieldId;
|
||||
|
||||
/// A codec that stores the field id, level 0, and facet string.
|
||||
///
|
||||
/// It can only be used to encode the facet string of the level 0,
|
||||
/// as it hardcodes the level.
|
||||
///
|
||||
/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys,
|
||||
/// and make sure that the levels are not mixed-up. The level 0 is special, the key
|
||||
/// are strings, other levels represent groups and keys are simply two integers.
|
||||
pub struct FacetStringLevelZeroCodec;
|
||||
|
||||
impl FacetStringLevelZeroCodec {
|
||||
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
|
||||
out.reserve(value.len() + 2);
|
||||
out.push(field_id);
|
||||
out.push(0); // the level zero (for LMDB ordering only)
|
||||
out.extend_from_slice(value.as_bytes());
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec {
|
||||
type DItem = (FieldId, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (field_id, bytes) = bytes.split_first()?;
|
||||
let (level, bytes) = bytes.split_first()?;
|
||||
|
||||
if *level != 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
let value = str::from_utf8(bytes).ok()?;
|
||||
Some((*field_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec {
|
||||
type EItem = (FieldId, &'a str);
|
||||
|
||||
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::new();
|
||||
FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
@ -0,0 +1,80 @@
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::{marker, str};
|
||||
|
||||
/// A codec that encodes two strings in front of the value.
|
||||
///
|
||||
/// The usecase is for the facet string levels algorithm where we must
|
||||
/// know the origin of a group, the group left and right bounds are stored
|
||||
/// in the value to not break the lexicographical ordering of the LMDB keys.
|
||||
pub struct FacetStringZeroBoundsValueCodec<C>(marker::PhantomData<C>);
|
||||
|
||||
impl<'a, C> heed::BytesDecode<'a> for FacetStringZeroBoundsValueCodec<C>
|
||||
where
|
||||
C: heed::BytesDecode<'a>,
|
||||
{
|
||||
type DItem = (Option<(&'a str, &'a str)>, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
|
||||
let (contains_bounds, tail_bytes) = bytes.split_first()?;
|
||||
|
||||
if *contains_bounds != 0 {
|
||||
let (left_len, bytes) = try_split_at(bytes, 2)?;
|
||||
let (right_len, bytes) = try_split_at(bytes, 2)?;
|
||||
|
||||
let left_len = left_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
let right_len = right_len.try_into().ok().map(u16::from_be_bytes)?;
|
||||
|
||||
let (left, bytes) = try_split_at(bytes, left_len as usize)?;
|
||||
let (right, bytes) = try_split_at(bytes, right_len as usize)?;
|
||||
|
||||
let left = str::from_utf8(left).ok()?;
|
||||
let right = str::from_utf8(right).ok()?;
|
||||
|
||||
C::bytes_decode(bytes).map(|item| (Some((left, right)), item))
|
||||
} else {
|
||||
C::bytes_decode(tail_bytes).map(|item| (None, item))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C> heed::BytesEncode<'a> for FacetStringZeroBoundsValueCodec<C>
|
||||
where
|
||||
C: heed::BytesEncode<'a>,
|
||||
{
|
||||
type EItem = (Option<(&'a str, &'a str)>, C::EItem);
|
||||
|
||||
fn bytes_encode((bounds, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
|
||||
let mut bytes = Vec::new();
|
||||
|
||||
match bounds {
|
||||
Some((left, right)) => {
|
||||
let left_len: u16 = left.len().try_into().ok()?;
|
||||
let right_len: u16 = right.len().try_into().ok()?;
|
||||
bytes.extend_from_slice(&left_len.to_be_bytes());
|
||||
bytes.extend_from_slice(&right_len.to_be_bytes());
|
||||
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
None => {
|
||||
bytes.push(0);
|
||||
let value_bytes = C::bytes_encode(&value)?;
|
||||
bytes.extend_from_slice(&value_bytes[..]);
|
||||
Some(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Splits the slice in two parts at the `mid` index, the first part being `mid` bytes long.
///
/// Returns `None` instead of panicking when `mid` is past the end of the slice.
fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    let head = slice.get(..mid)?;
    let tail = slice.get(mid..)?;
    Some((head, tail))
}
|
@ -1,9 +1,15 @@
|
||||
mod facet_level_value_f64_codec;
|
||||
mod facet_level_value_u32_codec;
|
||||
mod facet_string_level_zero_codec;
|
||||
mod facet_string_zero_bounds_value_codec;
|
||||
mod facet_value_string_codec;
|
||||
mod field_doc_id_facet_f64_codec;
|
||||
mod field_doc_id_facet_string_codec;
|
||||
|
||||
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
|
||||
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
|
||||
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
|
||||
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
|
||||
pub use self::facet_value_string_codec::FacetValueStringCodec;
|
||||
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
|
||||
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
|
||||
|
@ -31,7 +31,7 @@
|
||||
//!
|
||||
//! ### Example of what a facet number LMDB database contains
|
||||
//!
|
||||
//! | level | left-bound | right-bound | docs |
|
||||
//! | level | left-bound | right-bound | documents ids |
|
||||
//! |-------|------------|-------------|------------------|
|
||||
//! | 0 | 0 | _skipped_ | 1, 2 |
|
||||
//! | 0 | 1 | _skipped_ | 6, 7 |
|
||||
@ -48,7 +48,7 @@
|
||||
//! The next levels have two different bounds and the associated documents ids are simply the result
|
||||
//! of a union of all the documents ids associated with the aggregated groups above.
|
||||
//!
|
||||
//! ## The complexity of defining groups of facet strings
|
||||
//! ## The complexity of defining groups for facet strings
|
||||
//!
|
||||
//! As explained above, defining groups of facet numbers is easy, LMDB stores the keys in
|
||||
//! lexicographical order, it means that whatever the key represent the bytes are read in their raw
|
||||
@ -77,22 +77,25 @@
|
||||
//!
|
||||
//! #### Example of facet strings with numbered groups
|
||||
//!
|
||||
//! | level | left-bound | right-bound | left-string | right-string | docs |
|
||||
//! | level | left-bound | right-bound | left-string | right-string | documents ids |
|
||||
//! |-------|------------|-------------|-------------|--------------|------------------|
|
||||
//! | 0 | alpha | _skipped_ | _skipped_ | _skipped_ | 1, 2 |
|
||||
//! | 0 | beta | _skipped_ | _skipped_ | _skipped_ | 6, 7 |
|
||||
//! | 0 | gamma | _skipped_ | _skipped_ | _skipped_ | 4, 7 |
|
||||
//! | 0 | omega | _skipped_ | _skipped_ | _skipped_ | 2, 3, 4 |
|
||||
//! | 1 | 0 | 1 | alpha | beta | 1, 2, 6, 7 |
|
||||
//! | 1 | 3 | 5 | gamma | omega | 2, 3, 4, 7 |
|
||||
//! | 2 | 0 | 5 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
|
||||
//! | 1 | 2 | 3 | gamma | omega | 2, 3, 4, 7 |
|
||||
//! | 2 | 0 | 3 | _skipped_ | _skipped_ | 1, 2, 3, 4, 6, 7 |
|
||||
//!
|
||||
//! As you can see the level 0 doesn't actually change much, we skip nearly everything, we do not
|
||||
//! need to store the facet string value two times.
|
||||
//!
|
||||
//! In the value, not in the key, you can see that we added two new values:
|
||||
//! the left-string and the right-string, which defines the original facet strings associated with
|
||||
//! the given group.
|
||||
//! The numbers in the left-bound and right-bound columns are incremental numbers representing the
|
||||
//! level 0 strings, i.e. alpha is 0, beta is 1. Those numbers are just here to keep the ordering
|
||||
//! of the LMDB keys.
|
||||
//!
|
||||
//! In the value, not in the key, you can see that we added two new values: the left-string and the
|
||||
//! right-string, which defines the original facet strings associated with the given group.
|
||||
//!
|
||||
//! We put those two strings inside of the value, this way we do not limit the maximum size of the
|
||||
//! facet string values, and the impact on performances is not important as, IIRC, LMDB puts big
|
||||
@ -121,3 +124,124 @@
|
||||
//! If the group doesn't contain one of our documents ids, we continue to the next group at this
|
||||
//! same level.
|
||||
//!
|
||||
|
||||
use std::num::NonZeroU8;
|
||||
use std::ops::Bound;
|
||||
use std::ops::Bound::{Excluded, Included};
|
||||
|
||||
use heed::types::{ByteSlice, Str};
|
||||
use heed::{Database, LazyDecode, RoRange};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::facet::{
|
||||
FacetLevelValueU32Codec, FacetStringLevelZeroCodec, FacetStringZeroBoundsValueCodec,
|
||||
};
|
||||
use crate::heed_codec::CboRoaringBitmapCodec;
|
||||
use crate::FieldId;
|
||||
|
||||
/// An iterator that is used to explore the facets level strings
|
||||
/// from the level 1 to infinity.
|
||||
///
|
||||
/// It yields the level, group id that an entry covers, the optional group strings
|
||||
/// that it covers of the level 0 only if it is an entry from the level 1 and
|
||||
/// the roaring bitmap associated.
|
||||
pub struct FacetStringGroupRange<'t> {
|
||||
iter: RoRange<
|
||||
't,
|
||||
FacetLevelValueU32Codec,
|
||||
LazyDecode<FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>>,
|
||||
>,
|
||||
end: Bound<u32>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringGroupRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<
|
||||
FacetLevelValueU32Codec,
|
||||
FacetStringZeroBoundsValueCodec<CboRoaringBitmapCodec>,
|
||||
>,
|
||||
field_id: FieldId,
|
||||
level: NonZeroU8,
|
||||
left: Bound<u32>,
|
||||
right: Bound<u32>,
|
||||
) -> heed::Result<FacetStringGroupRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, level, left, u32::MIN)),
|
||||
Excluded(left) => Excluded((field_id, level, left, u32::MIN)),
|
||||
Unbounded => Included((field_id, level, u32::MIN, u32::MIN)),
|
||||
};
|
||||
let right_bound = Included((field_id, level, u32::MAX, u32::MAX));
|
||||
let iter = db.lazily_decode_data().range(rtxn, &(left_bound, right_bound))?;
|
||||
Ok(FacetStringGroupRange { iter, end: right })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetStringGroupRange<'t> {
|
||||
type Item = heed::Result<((NonZeroU8, u32, u32), (Option<(&'t str, &'t str)>, RoaringBitmap))>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((_fid, level, left, right), docids))) => {
|
||||
let must_be_returned = match self.end {
|
||||
Included(end) => right <= end,
|
||||
Excluded(end) => right < end,
|
||||
Unbounded => true,
|
||||
};
|
||||
if must_be_returned {
|
||||
match docids.decode() {
|
||||
Ok(docids) => Some(Ok(((level, left, right), docids))),
|
||||
Err(e) => Some(Err(e)),
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An iterator that is used to explore the level 0 of the facets string database.
|
||||
///
|
||||
/// It yields the facet string and the roaring bitmap associated with it.
|
||||
pub struct FacetStringLevelZeroRange<'t> {
|
||||
iter: RoRange<'t, FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
|
||||
}
|
||||
|
||||
impl<'t> FacetStringLevelZeroRange<'t> {
|
||||
pub fn new(
|
||||
rtxn: &'t heed::RoTxn,
|
||||
db: Database<FacetStringLevelZeroCodec, CboRoaringBitmapCodec>,
|
||||
field_id: FieldId,
|
||||
left: Bound<&str>,
|
||||
right: Bound<&str>,
|
||||
) -> heed::Result<FacetStringLevelZeroRange<'t>> {
|
||||
let left_bound = match left {
|
||||
Included(left) => Included((field_id, left)),
|
||||
Excluded(left) => Excluded((field_id, left)),
|
||||
Unbounded => Included((field_id, "")),
|
||||
};
|
||||
|
||||
let right_bound = match right {
|
||||
Included(right) => Included((field_id, right)),
|
||||
Excluded(right) => Excluded((field_id, right)),
|
||||
Unbounded => Excluded((field_id + 1, "")),
|
||||
};
|
||||
|
||||
db.range(rtxn, &(left_bound, right_bound)).map(|iter| FacetStringLevelZeroRange { iter })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'t> Iterator for FacetStringLevelZeroRange<'t> {
|
||||
type Item = heed::Result<(&'t str, RoaringBitmap)>;
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
match self.iter.next() {
|
||||
Some(Ok(((_fid, value), docids))) => Some(Ok((value, docids))),
|
||||
Some(Err(e)) => Some(Err(e)),
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user