Introduce codecs for facet types (string, f64, u64, i64)

This commit is contained in:
Clément Renault 2020-11-11 15:48:24 +01:00
parent b4951c058b
commit e0058c1125
No known key found for this signature in database
GPG Key ID: 92ADA4E935E71FA4
11 changed files with 280 additions and 0 deletions

15
src/facet/facet_type.rs Normal file
View File

@ -0,0 +1,15 @@
use std::cmp;
/// The kind of value a facet can hold.
///
/// The derived ordering follows the declaration order of the variants
/// (`String < F64 < I64 < U64`); [`FacetType::merge`] relies on it.
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub enum FacetType {
    String,
    F64,
    I64,
    U64,
}

impl FacetType {
    /// Combines two facet types by keeping the one that sorts first in the
    /// declaration order of the enum.
    ///
    /// When both arguments are equal, that same type is returned.
    pub fn merge(a: FacetType, b: FacetType) -> FacetType {
        match a.cmp(&b) {
            // `b` is declared before `a`, so it wins.
            cmp::Ordering::Greater => b,
            cmp::Ordering::Less | cmp::Ordering::Equal => a,
        }
    }
}

4
src/facet/mod.rs Normal file
View File

@ -0,0 +1,4 @@
mod facet_type;
pub mod value_encoding;
pub use self::facet_type::FacetType;

View File

@ -0,0 +1,89 @@
/// Converts a finite `f64` into 8 bytes whose lexicographic order matches the
/// numeric order of the floats.
///
/// Returns `None` for NaN and the infinities. Both `0.0` and `-0.0` produce
/// the same bytes.
///
/// Technique taken from <https://stackoverflow.com/a/43305015/1941280>.
#[inline]
pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> {
    if !float.is_finite() {
        return None;
    }

    let ordered = if float == 0.0 {
        // `-0.0 == 0.0` is true, so both zeros are normalised to `+0.0` here.
        xor_first_bit(0.0_f64.to_be_bytes())
    } else if float.is_sign_negative() {
        // Negative numbers: invert every bit so that larger magnitudes sort first.
        xor_all_bits(float.to_be_bytes())
    } else {
        // Positive numbers: only flip the sign bit so they sort after negatives.
        xor_first_bit(float.to_be_bytes())
    };

    Some(ordered)
}
/// Returns the big-endian byte representation of `int`.
///
/// Big-endian bytes of unsigned integers compare lexicographically in the
/// same order as the integers themselves.
#[inline]
pub fn u64_into_bytes(int: u64) -> [u8; 8] {
    u64::to_be_bytes(int)
}
/// Rebuilds a `u64` from its 8 big-endian bytes.
#[inline]
pub fn u64_from_bytes(bytes: [u8; 8]) -> u64 {
    let value = u64::from_be_bytes(bytes);
    value
}
/// Converts an `i64` into 8 bytes whose lexicographic order matches the
/// numeric order of the integers.
///
/// The value is written big-endian and its sign bit is flipped, so negative
/// numbers sort before positive ones.
#[inline]
pub fn i64_into_bytes(int: i64) -> [u8; 8] {
    let big_endian = int.to_be_bytes();
    xor_first_bit(big_endian)
}
/// Rebuilds an `i64` from bytes produced by `i64_into_bytes`.
///
/// The sign bit is flipped back before the bytes are read as big-endian.
#[inline]
pub fn i64_from_bytes(bytes: [u8; 8]) -> i64 {
    let big_endian = xor_first_bit(bytes);
    i64::from_be_bytes(big_endian)
}
/// Flips the most significant bit of the first byte and returns the array.
#[inline]
fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] {
    let [head, ..] = &mut x;
    *head ^= 0b1000_0000;
    x
}
/// Inverts every bit of every byte and returns the array.
#[inline]
fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
    for byte in x.iter_mut() {
        // XOR with 0xff is the same as a bitwise NOT on a `u8`.
        *byte = !*byte;
    }
    x
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns `true` when every element is strictly smaller than the next one.
    fn is_sorted<T: Ord>(x: &[T]) -> bool {
        x.windows(2).all(|pair| pair[0] < pair[1])
    }

    /// Encoded floats must sort in the same order as the floats themselves.
    #[test]
    fn ordered_f64_bytes() {
        let floats = [-13_f64, -10.0, -0.0, 1.0, 43.0];
        let vec: Vec<_> = floats.iter().copied().map(f64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }

    /// Encoded unsigned integers must sort in the same order as the integers.
    #[test]
    fn ordered_u64_bytes() {
        let ints = [0_u64, 1, 43];
        let vec: Vec<_> = ints.iter().copied().map(u64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }

    /// Encoded signed integers must sort in the same order as the integers.
    #[test]
    fn ordered_i64_bytes() {
        let ints = [-10_i64, 0, 1, 43];
        let vec: Vec<_> = ints.iter().copied().map(i64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }
}

View File

@ -0,0 +1,50 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::heed_codec::StrBytesCodec;
use crate::facet::value_encoding::f64_into_bytes;
/// Codec for `(name, f64)` pairs.
///
/// The encoded form is the name, a NUL separator, 8 bytes of order-preserving
/// float encoding, then the 8 raw big-endian bytes of the float.
pub struct FacetValueF64Codec;

impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec {
    type DItem = (&'a str, f64);

    /// Decodes the name and reads the float back from the raw big-endian
    /// bytes stored after the 8 ordered bytes.
    ///
    /// Returns `None` when the name cannot be decoded or when the payload
    /// after the name is not exactly 16 bytes long.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (name, buffer) = StrBytesCodec::bytes_decode(bytes)?;
        // Use a checked slice: indexing with `buffer[8..]` would panic on a
        // payload shorter than 8 bytes instead of reporting a decode failure.
        let raw = buffer.get(8..)?;
        let value = raw.try_into().ok().map(f64::from_be_bytes)?;
        Some((name, value))
    }
}
impl<'a> heed::BytesEncode<'a> for FacetValueF64Codec {
    type EItem = (&'a str, f64);

    /// Encodes the name followed by a 16-byte payload: the order-preserving
    /// bytes of the float, then its raw big-endian bytes.
    ///
    /// Returns `None` when `f64_into_bytes` rejects the value.
    fn bytes_encode((name, value): &Self::EItem) -> Option<Cow<[u8]>> {
        // The globally ordered float comes first so that keys sort numerically.
        let ordered = f64_into_bytes(*value)?;
        // The plain f64 bytes follow, just to be able to read the value back.
        let raw = value.to_be_bytes();

        let mut buffer = [0u8; 16];
        let (head, tail) = buffer.split_at_mut(8);
        head.copy_from_slice(&ordered);
        tail.copy_from_slice(&raw);

        let tuple = (*name, &buffer[..]);
        let encoded = StrBytesCodec::bytes_encode(&tuple)?;
        // `buffer` is local, so the result must own its bytes.
        Some(Cow::Owned(encoded.into_owned()))
    }
}
#[cfg(test)]
mod tests {
    use heed::{BytesDecode, BytesEncode};
    use super::*;

    /// Encoding then decoding must give back the same name and value.
    #[test]
    fn globally_ordered_f64() {
        let original = ("hello", -32.0);
        let encoded = FacetValueF64Codec::bytes_encode(&original).unwrap();
        let decoded = FacetValueF64Codec::bytes_decode(&encoded).unwrap();
        assert_eq!(decoded, original);
    }
}

View File

@ -0,0 +1,28 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::heed_codec::StrBytesCodec;
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
pub struct FacetValueI64Codec;
impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec {
type DItem = (&'a str, i64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?;
let value = bytes.try_into().map(i64_from_bytes).ok()?;
Some((name, value))
}
}
impl<'a> heed::BytesEncode<'a> for FacetValueI64Codec {
    type EItem = (&'a str, i64);

    /// Encodes the name followed by the order-preserving bytes of the integer.
    fn bytes_encode((name, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let ordered = i64_into_bytes(*value);
        let tuple = (*name, &ordered[..]);
        let encoded = StrBytesCodec::bytes_encode(&tuple)?;
        // `ordered` is local, so the result must own its bytes.
        Some(Cow::Owned(encoded.into_owned()))
    }
}

View File

@ -0,0 +1,25 @@
use std::borrow::Cow;
use std::str;
use crate::heed_codec::StrBytesCodec;
/// Codec for `(name, string value)` pairs.
///
/// The encoded form is the name, a NUL separator, then the UTF-8 bytes of
/// the value.
pub struct FacetValueStringCodec;

impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
    type DItem = (&'a str, &'a str);

    /// Decodes the name and the value.
    ///
    /// Returns `None` when the name cannot be decoded or when the payload
    /// after the name is not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (name, payload) = StrBytesCodec::bytes_decode(bytes)?;
        match str::from_utf8(payload) {
            Ok(value) => Some((name, value)),
            Err(_) => None,
        }
    }
}
impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec {
    type EItem = (&'a str, &'a str);

    /// Encodes the name followed by the UTF-8 bytes of the value.
    fn bytes_encode((name, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let payload: &[u8] = value.as_bytes();
        let tuple = (*name, payload);
        let encoded = StrBytesCodec::bytes_encode(&tuple)?;
        Some(Cow::Owned(encoded.into_owned()))
    }
}

View File

@ -0,0 +1,28 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::heed_codec::StrBytesCodec;
use crate::facet::value_encoding::{u64_from_bytes, u64_into_bytes};
pub struct FacetValueU64Codec;
impl<'a> heed::BytesDecode<'a> for FacetValueU64Codec {
type DItem = (&'a str, u64);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?;
let value = bytes.try_into().map(u64_from_bytes).ok()?;
Some((name, value))
}
}
impl<'a> heed::BytesEncode<'a> for FacetValueU64Codec {
    type EItem = (&'a str, u64);

    /// Encodes the name followed by the big-endian bytes of the integer.
    fn bytes_encode((name, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let big_endian = u64_into_bytes(*value);
        let tuple = (*name, &big_endian[..]);
        let encoded = StrBytesCodec::bytes_encode(&tuple)?;
        // `big_endian` is local, so the result must own its bytes.
        Some(Cow::Owned(encoded.into_owned()))
    }
}

View File

@ -0,0 +1,9 @@
mod facet_value_f64_codec;
mod facet_value_i64_codec;
mod facet_value_string_codec;
mod facet_value_u64_codec;
pub use self::facet_value_f64_codec::FacetValueF64Codec;
pub use self::facet_value_i64_codec::FacetValueI64Codec;
pub use self::facet_value_string_codec::FacetValueStringCodec;
pub use self::facet_value_u64_codec::FacetValueU64Codec;

View File

@ -1,8 +1,10 @@
mod beu32_str_codec;
mod bo_roaring_bitmap_codec;
mod cbo_roaring_bitmap_codec;
mod facet;
mod obkv_codec;
mod roaring_bitmap_codec;
mod str_bytes_codec;
mod str_str_u8_codec;
pub use self::beu32_str_codec::BEU32StrCodec;
@ -10,4 +12,5 @@ pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
pub use self::str_bytes_codec::StrBytesCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View File

@ -0,0 +1,28 @@
use std::borrow::Cow;
use std::str;
/// Codec for a string followed by arbitrary bytes, separated by a single
/// NUL byte.
pub struct StrBytesCodec;

impl<'a> heed::BytesDecode<'a> for StrBytesCodec {
    type DItem = (&'a str, &'a [u8]);

    /// Splits the input at the first NUL byte.
    ///
    /// Everything before it is returned as a string and everything after it
    /// as raw bytes. Returns `None` when there is no NUL byte or when the
    /// leading part is not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let separator = bytes.iter().position(|&byte| byte == 0)?;
        let name = str::from_utf8(&bytes[..separator]).ok()?;
        // `separator` is a valid index, so `separator + 1` is at most `bytes.len()`.
        let payload = &bytes[separator + 1..];
        Some((name, payload))
    }
}
impl<'a> heed::BytesEncode<'a> for StrBytesCodec {
    type EItem = (&'a str, &'a [u8]);

    /// Writes the string, a NUL separator, then the raw bytes.
    ///
    /// NOTE(review): a string that itself contains a NUL byte would not
    /// decode back to the same pair, since decoding splits at the first NUL.
    /// Confirm that callers never pass such strings.
    fn bytes_encode((s1, s2): &Self::EItem) -> Option<Cow<[u8]>> {
        let encoded = [s1.as_bytes(), &[0u8][..], *s2].concat();
        Some(Cow::Owned(encoded))
    }
}

View File

@ -4,6 +4,7 @@ mod index;
mod mdfs;
mod query_tokens;
mod search;
pub mod facet;
pub mod heed_codec;
pub mod proximity;
pub mod subcommand;