Prepare refactor of facets database

Prepare refactor of facets database
This commit is contained in:
Loïc Lecrenier 2022-08-29 16:01:54 +02:00 committed by Loïc Lecrenier
parent 004c09a8e2
commit c3f49f766d
27 changed files with 1662 additions and 1624 deletions

View file

@ -1,89 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
use crate::{try_split_array_at, FieldId};
// TODO do not de/serialize right bound when level = 0
pub struct FacetLevelValueF64Codec;
impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
type DItem = (FieldId, u8, f64, f64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (level, bytes) = bytes.split_first()?;
let (left, right) = if *level != 0 {
let left = bytes[16..24].try_into().ok().map(f64::from_be_bytes)?;
let right = bytes[24..].try_into().ok().map(f64::from_be_bytes)?;
(left, right)
} else {
let left = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
(left, left)
};
Some((field_id, *level, left, right))
}
}
impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
type EItem = (FieldId, u8, f64, f64);
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 32];
let len = if *level != 0 {
// Write the globally ordered floats.
let bytes = f64_into_bytes(*left)?;
buffer[..8].copy_from_slice(&bytes[..]);
let bytes = f64_into_bytes(*right)?;
buffer[8..16].copy_from_slice(&bytes[..]);
// Then the f64 values just to be able to read them back.
let bytes = left.to_be_bytes();
buffer[16..24].copy_from_slice(&bytes[..]);
let bytes = right.to_be_bytes();
buffer[24..].copy_from_slice(&bytes[..]);
32 // length
} else {
// Write the globally ordered floats.
let bytes = f64_into_bytes(*left)?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 values just to be able to read them back.
let bytes = left.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
16 // length
};
let mut bytes = Vec::with_capacity(len + 3);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(*level);
bytes.extend_from_slice(&buffer[..len]);
Some(Cow::Owned(bytes))
}
}
#[cfg(test)]
mod tests {
use heed::{BytesDecode, BytesEncode};
use super::*;
#[test]
fn globally_ordered_f64() {
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap();
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
assert_eq!((name, level, left, right), (3, 0, 32.0, 32.0));
let bytes = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap();
let (name, level, left, right) = FacetLevelValueF64Codec::bytes_decode(&bytes).unwrap();
assert_eq!((name, level, left, right), (3, 1, -32.0, 32.0));
}
}

View file

@ -1,53 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::num::NonZeroU8;
use crate::{try_split_array_at, FieldId};
/// A codec that stores the field id, level 1 and higher and the groups ids.
///
/// It can only be used to encode the facet string of the level 1 or higher.
pub struct FacetLevelValueU32Codec;
impl<'a> heed::BytesDecode<'a> for FacetLevelValueU32Codec {
type DItem = (FieldId, NonZeroU8, u32, u32);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (level, bytes) = bytes.split_first()?;
let level = NonZeroU8::new(*level)?;
let left = bytes[8..12].try_into().ok().map(u32::from_be_bytes)?;
let right = bytes[12..].try_into().ok().map(u32::from_be_bytes)?;
Some((field_id, level, left, right))
}
}
impl heed::BytesEncode<'_> for FacetLevelValueU32Codec {
type EItem = (FieldId, NonZeroU8, u32, u32);
fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 16];
// Write the big-endian integers.
let bytes = left.to_be_bytes();
buffer[..4].copy_from_slice(&bytes[..]);
let bytes = right.to_be_bytes();
buffer[4..8].copy_from_slice(&bytes[..]);
// Then the u32 values just to be able to read them back.
let bytes = left.to_be_bytes();
buffer[8..12].copy_from_slice(&bytes[..]);
let bytes = right.to_be_bytes();
buffer[12..].copy_from_slice(&bytes[..]);
let mut bytes = Vec::with_capacity(buffer.len() + 2 + 1);
bytes.extend_from_slice(&field_id.to_be_bytes());
bytes.push(level.get());
bytes.extend_from_slice(&buffer);
Some(Cow::Owned(bytes))
}
}

View file

@ -1,50 +0,0 @@
use std::borrow::Cow;
use std::str;
use crate::{try_split_array_at, FieldId};
/// A codec that stores the field id, level 0, and facet string.
///
/// It can only be used to encode the facet string of the level 0,
/// as it hardcodes the level.
///
/// We encode the level 0 to not break the lexicographical ordering of the LMDB keys,
/// and make sure that the levels are not mixed-up. The level 0 is special, the key
/// are strings, other levels represent groups and keys are simply two integers.
pub struct FacetStringLevelZeroCodec;
impl FacetStringLevelZeroCodec {
pub fn serialize_into(field_id: FieldId, value: &str, out: &mut Vec<u8>) {
out.reserve(value.len() + 2);
out.extend_from_slice(&field_id.to_be_bytes());
out.push(0); // the level zero (for LMDB ordering only)
out.extend_from_slice(value.as_bytes());
}
}
impl<'a> heed::BytesDecode<'a> for FacetStringLevelZeroCodec {
type DItem = (FieldId, &'a str);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (field_id_bytes, bytes) = try_split_array_at(bytes)?;
let field_id = u16::from_be_bytes(field_id_bytes);
let (level, bytes) = bytes.split_first()?;
if *level != 0 {
return None;
}
let value = str::from_utf8(bytes).ok()?;
Some((field_id, value))
}
}
impl<'a> heed::BytesEncode<'a> for FacetStringLevelZeroCodec {
type EItem = (FieldId, &'a str);
fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
let mut bytes = Vec::new();
FacetStringLevelZeroCodec::serialize_into(*field_id, value, &mut bytes);
Some(Cow::Owned(bytes))
}
}

View file

@ -1,90 +0,0 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::{marker, str};
use crate::error::SerializationError;
use crate::heed_codec::RoaringBitmapCodec;
use crate::{try_split_array_at, try_split_at, Result};
pub type FacetStringLevelZeroValueCodec = StringValueCodec<RoaringBitmapCodec>;
/// A codec that encodes a string in front of a value.
///
/// The usecase is for the facet string levels algorithm where we must know the
/// original string of a normalized facet value, the original values are stored
/// in the value to not break the lexicographical ordering of the LMDB keys.
pub struct StringValueCodec<C>(marker::PhantomData<C>);
impl<'a, C> heed::BytesDecode<'a> for StringValueCodec<C>
where
C: heed::BytesDecode<'a>,
{
type DItem = (&'a str, C::DItem);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (string, bytes) = decode_prefix_string(bytes)?;
C::bytes_decode(bytes).map(|item| (string, item))
}
}
impl<'a, C> heed::BytesEncode<'a> for StringValueCodec<C>
where
C: heed::BytesEncode<'a>,
{
type EItem = (&'a str, C::EItem);
fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
let value_bytes = C::bytes_encode(&value)?;
let mut bytes = Vec::with_capacity(2 + string.len() + value_bytes.len());
encode_prefix_string(string, &mut bytes).ok()?;
bytes.extend_from_slice(&value_bytes[..]);
Some(Cow::Owned(bytes))
}
}
pub fn decode_prefix_string(value: &[u8]) -> Option<(&str, &[u8])> {
let (original_length_bytes, bytes) = try_split_array_at(value)?;
let original_length = u16::from_be_bytes(original_length_bytes) as usize;
let (string, bytes) = try_split_at(bytes, original_length)?;
let string = str::from_utf8(string).ok()?;
Some((string, bytes))
}
pub fn encode_prefix_string(string: &str, buffer: &mut Vec<u8>) -> Result<()> {
let string_len: u16 =
string.len().try_into().map_err(|_| SerializationError::InvalidNumberSerialization)?;
buffer.extend_from_slice(&string_len.to_be_bytes());
buffer.extend_from_slice(string.as_bytes());
Ok(())
}
#[cfg(test)]
mod tests {
use heed::types::Unit;
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use super::*;
#[test]
fn deserialize_roaring_bitmaps() {
let string = "abc";
let docids: RoaringBitmap = (0..100).chain(3500..4398).collect();
let key = (string, docids.clone());
let bytes = StringValueCodec::<RoaringBitmapCodec>::bytes_encode(&key).unwrap();
let (out_string, out_docids) =
StringValueCodec::<RoaringBitmapCodec>::bytes_decode(&bytes).unwrap();
assert_eq!((out_string, out_docids), (string, docids));
}
#[test]
fn deserialize_unit() {
let string = "def";
let key = (string, ());
let bytes = StringValueCodec::<Unit>::bytes_encode(&key).unwrap();
let (out_string, out_unit) = StringValueCodec::<Unit>::bytes_decode(&bytes).unwrap();
assert_eq!((out_string, out_unit), (string, ()));
}
}

View file

@ -1,19 +1,21 @@
mod facet_level_value_f64_codec;
mod facet_level_value_u32_codec;
mod facet_string_level_zero_codec;
mod facet_string_level_zero_value_codec;
// mod facet_level_value_f64_codec;
// mod facet_level_value_u32_codec;
// mod facet_string_level_zero_codec;
// mod facet_string_level_zero_value_codec;
mod facet_string_zero_bounds_value_codec;
mod field_doc_id_facet_f64_codec;
mod field_doc_id_facet_string_codec;
pub mod new;
use heed::types::OwnedType;
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
pub use self::facet_string_level_zero_value_codec::{
decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec,
};
// pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
// pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
// pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
// pub use self::facet_string_level_zero_value_codec::{
// decode_prefix_string, encode_prefix_string, FacetStringLevelZeroValueCodec,
// };
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;

View file

@ -0,0 +1,148 @@
use heed::{BytesDecode, BytesEncode};
use roaring::RoaringBitmap;
use std::{borrow::Cow, convert::TryFrom, marker::PhantomData};
pub mod ordered_f64_codec;
pub mod str_ref;
// TODO: these codecs were quickly written and not fast/resilient enough
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub struct FacetKey<T> {
pub field_id: u16,
pub level: u8,
pub left_bound: T,
}
impl<'a> FacetKey<&'a [u8]> {
pub fn into_owned(self) -> FacetKey<Vec<u8>> {
FacetKey {
field_id: self.field_id,
level: self.level,
left_bound: self.left_bound.to_vec(),
}
}
}
impl<'a> FacetKey<Vec<u8>> {
pub fn as_ref(&self) -> FacetKey<&[u8]> {
FacetKey {
field_id: self.field_id,
level: self.level,
left_bound: self.left_bound.as_slice(),
}
}
}
pub struct FacetGroupValue {
pub size: u8,
pub bitmap: RoaringBitmap,
}
pub struct FacetKeyCodec<T> {
_phantom: PhantomData<T>,
}
impl<'a, T> heed::BytesEncode<'a> for FacetKeyCodec<T>
where
T: BytesEncode<'a>,
T::EItem: Sized,
{
type EItem = FacetKey<T::EItem>;
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let mut v = vec![];
v.extend_from_slice(&value.field_id.to_be_bytes());
v.extend_from_slice(&[value.level]);
let bound = T::bytes_encode(&value.left_bound).unwrap();
v.extend_from_slice(&bound);
Some(Cow::Owned(v))
}
}
impl<'a, T> heed::BytesDecode<'a> for FacetKeyCodec<T>
where
T: BytesDecode<'a>,
{
type DItem = FacetKey<T::DItem>;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1]).unwrap());
let level = bytes[2];
let bound = T::bytes_decode(&bytes[3..]).unwrap();
Some(FacetKey { field_id: fid, level, left_bound: bound })
}
}
pub struct FacetGroupValueCodec;
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
type EItem = FacetGroupValue;
fn bytes_encode(value: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
let mut v = vec![];
v.push(value.size);
value.bitmap.serialize_into(&mut v).unwrap();
Some(Cow::Owned(v))
}
}
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
type DItem = FacetGroupValue;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let size = bytes[0];
let bitmap = RoaringBitmap::deserialize_from(&bytes[1..]).unwrap();
Some(FacetGroupValue { size, bitmap })
}
}
// TODO: get rid of this codec as it is named confusingly + should really be part of heed
// or even replace the current ByteSlice codec
pub struct MyByteSlice;
impl<'a> BytesEncode<'a> for MyByteSlice {
type EItem = &'a [u8];
fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item))
}
}
impl<'a> BytesDecode<'a> for MyByteSlice {
type DItem = &'a [u8];
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
Some(bytes)
}
}
// I won't need these ones anymore
// pub struct U16Codec;
// impl<'a> BytesEncode<'a> for U16Codec {
// type EItem = u16;
// fn bytes_encode(item: &'a Self::EItem) -> Option<Cow<'a, [u8]>> {
// Some(Cow::Owned(item.to_be_bytes().to_vec()))
// }
// }
// impl<'a> BytesDecode<'a> for U16Codec {
// type DItem = u16;
// fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
// Some(u16::from_be_bytes(bytes[0..=1].try_into().unwrap()))
// }
// }
// pub struct StrCodec;
// impl<'a> BytesEncode<'a> for StrCodec {
// type EItem = &'a str;
// fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
// Some(Cow::Borrowed(item.as_bytes()))
// }
// }
// impl<'a> BytesDecode<'a> for StrCodec {
// type DItem = &'a str;
// fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
// let s = std::str::from_utf8(bytes).unwrap();
// Some(s)
// }
// }

View file

@ -0,0 +1,36 @@
use std::{borrow::Cow, convert::TryInto};
use heed::BytesDecode;
use crate::facet::value_encoding::f64_into_bytes;
pub struct OrderedF64Codec;
impl<'a> BytesDecode<'a> for OrderedF64Codec {
type DItem = f64;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
if bytes.len() < 16 {
return None;
}
let f = bytes[8..].try_into().ok().map(f64::from_be_bytes)?;
Some(f)
}
}
impl heed::BytesEncode<'_> for OrderedF64Codec {
type EItem = f64;
fn bytes_encode(f: &Self::EItem) -> Option<Cow<[u8]>> {
let mut buffer = [0u8; 16];
// write the globally ordered float
let bytes = f64_into_bytes(*f)?;
buffer[..8].copy_from_slice(&bytes[..]);
// Then the f64 value just to be able to read it back
let bytes = f.to_be_bytes();
buffer[8..16].copy_from_slice(&bytes[..]);
Some(Cow::Owned(buffer.to_vec()))
}
}

View file

@ -0,0 +1,20 @@
use std::borrow::Cow;
use heed::{BytesDecode, BytesEncode};
pub struct StrRefCodec;
impl<'a> BytesEncode<'a> for StrRefCodec {
type EItem = &'a str;
fn bytes_encode(item: &'a &'a str) -> Option<Cow<'a, [u8]>> {
Some(Cow::Borrowed(item.as_bytes()))
}
}
impl<'a> BytesDecode<'a> for StrRefCodec {
type DItem = &'a str;
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let s = std::str::from_utf8(bytes).unwrap();
Some(s)
}
}