// Origin:  patch e0058c1125e1a383a485358be2cf87d04c860669
// Author:  Clément Renault
// Date:    Wed, 11 Nov 2020 15:48:24 +0100
// Subject: Introduce codecs for facet types (string, f64, u64, i64)
//
// The patch creates
//   src/facet/{facet_type,mod,value_encoding}.rs,
//   src/heed_codec/facet/{facet_value_f64_codec,facet_value_i64_codec,
//                         facet_value_string_codec,facet_value_u64_codec,mod}.rs,
//   src/heed_codec/str_bytes_codec.rs,
// and modifies src/heed_codec/mod.rs and src/lib.rs.
//
// `src/facet/mod.rs` declares the private `facet_type` module and the public
// `value_encoding` module, and re-exports `FacetType` from `facet_type`.

// ---- src/facet/facet_type.rs ----
use std::cmp;

/// Type tag for the value stored in a facet.
///
/// The declaration order is significant: the derived `Ord` ranks the variants
/// in the order they are written (`String < F64 < I64 < U64`), and
/// [`FacetType::merge`] relies on that ranking. Reordering the variants
/// changes the result of `merge`.
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub enum FacetType {
    String,
    F64,
    I64,
    U64,
}

impl FacetType {
    /// Combines two facet types into a single one.
    ///
    /// Returns whichever of `a` and `b` ranks lowest under the derived
    /// ordering, so `String` wins over every numeric type, `F64` wins over
    /// both integer types, and `I64` wins over `U64`. Merging a type with
    /// itself returns that type.
    pub fn merge(a: FacetType, b: FacetType) -> FacetType {
        cmp::min(a, b)
    }
}
// ---- src/facet/value_encoding.rs ----
//
// Fixed-width, big-endian encodings of numbers whose byte-wise
// (lexicographic) order is the same as the numeric order of the inputs.
// The float scheme follows https://stackoverflow.com/a/43305015/1941280

/// Encodes a finite `f64` into 8 bytes that sort like the number itself.
///
/// * Zero (either sign) is normalised to `+0.0` before encoding, so `0.0`
///   and `-0.0` produce the same bytes.
/// * Negative numbers have every bit flipped, which reverses their order.
/// * Positive numbers only have the sign bit flipped, which places them
///   after all negative numbers.
///
/// Returns `None` for NaN and for either infinity.
#[inline]
pub fn f64_into_bytes(float: f64) -> Option<[u8; 8]> {
    if !float.is_finite() {
        return None;
    }

    let encoded = if float == 0.0 {
        // `-0.0 == 0.0` holds, so this single comparison covers both zeros.
        xor_first_bit(0.0_f64.to_be_bytes())
    } else if float.is_sign_negative() {
        xor_all_bits(float.to_be_bytes())
    } else {
        xor_first_bit(float.to_be_bytes())
    };

    Some(encoded)
}

/// Encodes a `u64` as its big-endian bytes, which already sort numerically.
#[inline]
pub fn u64_into_bytes(int: u64) -> [u8; 8] {
    int.to_be_bytes()
}

/// Decodes bytes produced by [`u64_into_bytes`].
#[inline]
pub fn u64_from_bytes(bytes: [u8; 8]) -> u64 {
    u64::from_be_bytes(bytes)
}

/// Encodes an `i64` as big-endian bytes with the sign bit flipped, so that
/// negative values sort before non-negative ones.
#[inline]
pub fn i64_into_bytes(int: i64) -> [u8; 8] {
    xor_first_bit(int.to_be_bytes())
}

/// Decodes bytes produced by [`i64_into_bytes`].
#[inline]
pub fn i64_from_bytes(bytes: [u8; 8]) -> i64 {
    i64::from_be_bytes(xor_first_bit(bytes))
}

/// Flips the most significant bit of the first byte (the sign bit of a
/// big-endian number).
#[inline]
fn xor_first_bit(mut x: [u8; 8]) -> [u8; 8] {
    x[0] ^= 0x80;
    x
}

/// Flips every bit of the 8 bytes.
#[inline]
fn xor_all_bits(mut x: [u8; 8]) -> [u8; 8] {
    x.iter_mut().for_each(|b| *b ^= 0xff);
    x
}

#[cfg(test)]
mod tests {
    use std::cmp::Ordering::Less;
    use super::*;

    /// `true` when every element is strictly smaller than its successor.
    fn is_sorted<T: Ord>(x: &[T]) -> bool {
        x.windows(2).map(|x| x[0].cmp(&x[1])).all(|o| o == Less)
    }

    #[test]
    fn ordered_f64_bytes() {
        let a = -13_f64;
        let b = -10.0;
        let c = -0.0;
        let d = 1.0;
        let e = 43.0;

        let vec: Vec<_> = [a, b, c, d, e].iter().cloned().map(f64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }

    #[test]
    fn signed_zeros_share_one_encoding() {
        assert_eq!(f64_into_bytes(-0.0), f64_into_bytes(0.0));
    }

    #[test]
    fn non_finite_f64_is_rejected() {
        assert_eq!(f64_into_bytes(f64::NAN), None);
        assert_eq!(f64_into_bytes(f64::INFINITY), None);
        assert_eq!(f64_into_bytes(f64::NEG_INFINITY), None);
    }

    #[test]
    fn ordered_u64_bytes() {
        let a = 0_u64;
        let b = 1_u64;
        let c = 43_u64;

        let vec: Vec<_> = [a, b, c].iter().cloned().map(u64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }

    #[test]
    fn ordered_i64_bytes() {
        let a = -10_i64;
        let b = -0_i64;
        let c = 1_i64;
        let d = 43_i64;

        let vec: Vec<_> = [a, b, c, d].iter().cloned().map(i64_into_bytes).collect();
        assert!(is_sorted(&vec), "{:?}", vec);
    }
}
b/src/heed_codec/facet/facet_value_f64_codec.rs new file mode 100644 index 000000000..fdacb1f08 --- /dev/null +++ b/src/heed_codec/facet/facet_value_f64_codec.rs @@ -0,0 +1,50 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::f64_into_bytes; + +pub struct FacetValueF64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueF64Codec { + type DItem = (&'a str, f64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, buffer) = StrBytesCodec::bytes_decode(bytes)?; + let value = buffer[8..].try_into().ok().map(f64::from_be_bytes)?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueF64Codec { + type EItem = (&'a str, f64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let mut buffer = [0u8; 16]; + + // Write the globally ordered float. + let bytes = f64_into_bytes(*value)?; + buffer[..8].copy_from_slice(&bytes[..]); + + // Then the f64 value just to be able to read it back. 
+ let bytes = value.to_be_bytes(); + buffer[8..].copy_from_slice(&bytes[..]); + + let tuple = (*name, &buffer[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} + +#[cfg(test)] +mod tests { + use heed::{BytesEncode, BytesDecode}; + use super::*; + + #[test] + fn globally_ordered_f64() { + let bytes = FacetValueF64Codec::bytes_encode(&("hello", -32.0)).unwrap(); + let (name, value) = FacetValueF64Codec::bytes_decode(&bytes).unwrap(); + assert_eq!((name, value), ("hello", -32.0)); + } +} diff --git a/src/heed_codec/facet/facet_value_i64_codec.rs b/src/heed_codec/facet/facet_value_i64_codec.rs new file mode 100644 index 000000000..e3c333883 --- /dev/null +++ b/src/heed_codec/facet/facet_value_i64_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes}; + +pub struct FacetValueI64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueI64Codec { + type DItem = (&'a str, i64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = bytes.try_into().map(i64_from_bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueI64Codec { + type EItem = (&'a str, i64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let value = i64_into_bytes(*value); + let tuple = (*name, &value[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/facet_value_string_codec.rs b/src/heed_codec/facet/facet_value_string_codec.rs new file mode 100644 index 000000000..8b046192a --- /dev/null +++ b/src/heed_codec/facet/facet_value_string_codec.rs @@ -0,0 +1,25 @@ +use std::borrow::Cow; +use std::str; + +use crate::heed_codec::StrBytesCodec; + +pub struct FacetValueStringCodec; + +impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec { + type 
DItem = (&'a str, &'a str); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = str::from_utf8(bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec { + type EItem = (&'a str, &'a str); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let tuple = (*name, value.as_bytes()); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/facet_value_u64_codec.rs b/src/heed_codec/facet/facet_value_u64_codec.rs new file mode 100644 index 000000000..2fdc7416e --- /dev/null +++ b/src/heed_codec/facet/facet_value_u64_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::convert::TryInto; +use std::str; + +use crate::heed_codec::StrBytesCodec; +use crate::facet::value_encoding::{u64_from_bytes, u64_into_bytes}; + +pub struct FacetValueU64Codec; + +impl<'a> heed::BytesDecode<'a> for FacetValueU64Codec { + type DItem = (&'a str, u64); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let (name, bytes) = StrBytesCodec::bytes_decode(bytes)?; + let value = bytes.try_into().map(u64_from_bytes).ok()?; + Some((name, value)) + } +} + +impl<'a> heed::BytesEncode<'a> for FacetValueU64Codec { + type EItem = (&'a str, u64); + + fn bytes_encode((name, value): &Self::EItem) -> Option> { + let value = u64_into_bytes(*value); + let tuple = (*name, &value[..]); + StrBytesCodec::bytes_encode(&tuple).map(Cow::into_owned).map(Cow::Owned) + } +} diff --git a/src/heed_codec/facet/mod.rs b/src/heed_codec/facet/mod.rs new file mode 100644 index 000000000..ba34f6220 --- /dev/null +++ b/src/heed_codec/facet/mod.rs @@ -0,0 +1,9 @@ +mod facet_value_f64_codec; +mod facet_value_i64_codec; +mod facet_value_string_codec; +mod facet_value_u64_codec; + +pub use self::facet_value_f64_codec::FacetValueF64Codec; +pub use self::facet_value_i64_codec::FacetValueI64Codec; +pub use 
self::facet_value_string_codec::FacetValueStringCodec; +pub use self::facet_value_u64_codec::FacetValueU64Codec; diff --git a/src/heed_codec/mod.rs b/src/heed_codec/mod.rs index 68739fbf1..260e79c4b 100644 --- a/src/heed_codec/mod.rs +++ b/src/heed_codec/mod.rs @@ -1,8 +1,10 @@ mod beu32_str_codec; mod bo_roaring_bitmap_codec; mod cbo_roaring_bitmap_codec; +mod facet; mod obkv_codec; mod roaring_bitmap_codec; +mod str_bytes_codec; mod str_str_u8_codec; pub use self::beu32_str_codec::BEU32StrCodec; @@ -10,4 +12,5 @@ pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec; pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec; pub use self::obkv_codec::ObkvCodec; pub use self::roaring_bitmap_codec::RoaringBitmapCodec; +pub use self::str_bytes_codec::StrBytesCodec; pub use self::str_str_u8_codec::StrStrU8Codec; diff --git a/src/heed_codec/str_bytes_codec.rs b/src/heed_codec/str_bytes_codec.rs new file mode 100644 index 000000000..1a864c1fb --- /dev/null +++ b/src/heed_codec/str_bytes_codec.rs @@ -0,0 +1,28 @@ +use std::borrow::Cow; +use std::str; + +pub struct StrBytesCodec; + +impl<'a> heed::BytesDecode<'a> for StrBytesCodec { + type DItem = (&'a str, &'a [u8]); + + fn bytes_decode(bytes: &'a [u8]) -> Option { + let s1_end = bytes.iter().position(|b| *b == 0)?; + let (s1_bytes, s2_bytes) = bytes.split_at(s1_end); + let s1 = str::from_utf8(s1_bytes).ok()?; + let s2 = &s2_bytes[1..]; + Some((s1, s2)) + } +} + +impl<'a> heed::BytesEncode<'a> for StrBytesCodec { + type EItem = (&'a str, &'a [u8]); + + fn bytes_encode((s1, s2): &Self::EItem) -> Option> { + let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1); + bytes.extend_from_slice(s1.as_bytes()); + bytes.push(0); + bytes.extend_from_slice(s2); + Some(Cow::Owned(bytes)) + } +} diff --git a/src/lib.rs b/src/lib.rs index bea05e68a..2e879fd03 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -4,6 +4,7 @@ mod index; mod mdfs; mod query_tokens; mod search; +pub mod facet; pub mod heed_codec; pub mod proximity; 
pub mod subcommand;