Return the original string values for the inverted facet index database

This commit is contained in:
Clément Renault 2021-07-17 12:50:01 +02:00 committed by Kerollmops
parent 03a01166ba
commit 0227254a65
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
15 changed files with 242 additions and 58 deletions

View file

@ -0,0 +1,80 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::{marker, str};
use super::try_split_at;
/// A codec that encodes a string in front of the value.
///
/// The use case is the facet string levels algorithm, where we must know the
/// original string of a normalized facet value. The original strings are stored
/// in the value so as not to break the lexicographical ordering of the LMDB keys.
///
/// Encoded layout, as produced and consumed by the `BytesEncode`/`BytesDecode`
/// impls of this type:
///
/// 1. the byte length of the string, as a 2-byte big-endian `u16`,
/// 2. the UTF-8 bytes of the string,
/// 3. the bytes of the value, as encoded by the inner codec `C`.
///
/// A string whose byte length does not fit in a `u16` cannot be encoded.
///
/// `C` is only used at the type level, hence the `PhantomData` field.
pub struct FacetStringLevelZeroValueCodec<C>(marker::PhantomData<C>);
impl<'a, C> heed::BytesDecode<'a> for FacetStringLevelZeroValueCodec<C>
where
    C: heed::BytesDecode<'a>,
{
    type DItem = (&'a str, C::DItem);

    /// Decodes a `(string, value)` pair from `bytes`.
    ///
    /// The input must start with a 2-byte big-endian length, followed by that
    /// many bytes of UTF-8 text; whatever remains is handed to the inner codec.
    ///
    /// Returns `None` when the input is too short for the length prefix or
    /// for the announced string, when the string is not valid UTF-8, or when
    /// the inner codec fails to decode the remaining bytes.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        // Read the fixed-size length prefix.
        let (len_prefix, rest) = try_split_at(bytes, 2)?;
        let len_prefix: [u8; 2] = len_prefix.try_into().ok()?;
        let original_len = u16::from_be_bytes(len_prefix) as usize;

        // Carve out the string; everything after it belongs to the inner codec.
        let (original, value_bytes) = try_split_at(rest, original_len)?;
        let original = str::from_utf8(original).ok()?;

        let value = C::bytes_decode(value_bytes)?;
        Some((original, value))
    }
}
impl<'a, C> heed::BytesEncode<'a> for FacetStringLevelZeroValueCodec<C>
where
    C: heed::BytesEncode<'a>,
{
    type EItem = (&'a str, C::EItem);

    /// Encodes a `(string, value)` pair.
    ///
    /// The output is the string's byte length as a 2-byte big-endian `u16`,
    /// the string's bytes, then the value as encoded by the inner codec.
    ///
    /// Returns `None` when the string's byte length does not fit in a `u16`
    /// or when the inner codec fails to encode the value.
    fn bytes_encode((string, value): &'a Self::EItem) -> Option<Cow<[u8]>> {
        // The length check comes first, exactly as the prefix comes first in the output.
        let string_len: u16 = string.len().try_into().ok()?;
        let len_prefix = string_len.to_be_bytes();

        let value_bytes = C::bytes_encode(value)?;

        // `concat` sizes the output buffer for the three parts in one allocation.
        let encoded = [&len_prefix[..], string.as_bytes(), &value_bytes[..]].concat();
        Some(Cow::Owned(encoded))
    }
}
#[cfg(test)]
mod tests {
    use heed::types::Unit;
    use heed::{BytesDecode, BytesEncode};
    use roaring::RoaringBitmap;

    use super::*;
    use crate::CboRoaringBitmapCodec;

    /// Round-trips a string paired with a roaring bitmap value.
    #[test]
    fn deserialize_roaring_bitmaps() {
        type Codec = FacetStringLevelZeroValueCodec<CboRoaringBitmapCodec>;

        let original = "abc";
        let docids: RoaringBitmap = (0..100).chain(3500..4398).collect();
        let item = (original, docids.clone());

        let encoded = Codec::bytes_encode(&item).unwrap();
        let decoded = Codec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (original, docids));
    }

    /// Round-trips a string paired with the unit value.
    #[test]
    fn deserialize_unit() {
        type Codec = FacetStringLevelZeroValueCodec<Unit>;

        let original = "def";
        let item = (original, ());

        let encoded = Codec::bytes_encode(&item).unwrap();
        let decoded = Codec::bytes_decode(&encoded).unwrap();

        assert_eq!(decoded, (original, ()));
    }
}

View file

@ -2,7 +2,9 @@ use std::borrow::Cow;
use std::convert::TryInto;
use std::{marker, str};
/// A codec that encodes two strings in front of the value.
use super::try_split_at;
/// A codec that optionally encodes two strings in front of the value.
///
/// The use case is for the facet string levels algorithm where we must
/// know the origin of a group, the group left and right bounds are stored
@ -79,16 +81,6 @@ where
}
}
/// Splits `slice` in two at index `mid`.
///
/// Returns `Some((head, tail))` where `head` holds the first `mid` bytes and
/// `tail` the rest, or `None` if `slice` has fewer than `mid` bytes.
fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    // Guard first: `split_at` itself would panic on an out-of-range index.
    if mid > slice.len() {
        return None;
    }
    Some(slice.split_at(mid))
}
#[cfg(test)]
mod tests {
use heed::types::Unit;

View file

@ -1,6 +1,7 @@
mod facet_level_value_f64_codec;
mod facet_level_value_u32_codec;
mod facet_string_level_zero_codec;
mod facet_string_level_zero_value_codec;
mod facet_string_zero_bounds_value_codec;
mod field_doc_id_facet_f64_codec;
mod field_doc_id_facet_string_codec;
@ -8,6 +9,17 @@ mod field_doc_id_facet_string_codec;
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
pub use self::facet_level_value_u32_codec::FacetLevelValueU32Codec;
pub use self::facet_string_level_zero_codec::FacetStringLevelZeroCodec;
pub use self::facet_string_level_zero_value_codec::FacetStringLevelZeroValueCodec;
pub use self::facet_string_zero_bounds_value_codec::FacetStringZeroBoundsValueCodec;
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;
/// Splits `slice` in two at index `mid`.
///
/// Returns `Some((head, tail))` where `head` holds the first `mid` bytes and
/// `tail` the rest, or `None` if `slice` has fewer than `mid` bytes.
pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
    // Checked range lookups yield `None` instead of panicking when `mid`
    // is past the end of the slice.
    let head = slice.get(..mid)?;
    let tail = slice.get(mid..)?;
    Some((head, tail))
}