Change the project to become a workspace with milli as a default-member

This commit is contained in:
Clément Renault 2021-02-12 16:15:09 +01:00
parent d450b971f9
commit e8639517da
No known key found for this signature in database
GPG key ID: 92ADA4E935E71FA4
56 changed files with 1053 additions and 2617 deletions

View file

@ -0,0 +1,27 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
/// Codec for values made of a big-endian `u32` followed by a UTF-8 string.
pub struct BEU32StrCodec;

impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
    type DItem = (u32, &'a str);

    /// Decodes a 4-byte big-endian `u32` prefix followed by a UTF-8 string.
    ///
    /// Returns `None` when fewer than 4 bytes are supplied or when the
    /// remaining bytes are not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        // `split_at` panics when the slice is shorter than the split point,
        // so reject short input explicitly instead of crashing the caller.
        if bytes.len() < 4 {
            return None;
        }
        let (n_bytes, str_bytes) = bytes.split_at(4);
        let n = n_bytes.try_into().map(u32::from_be_bytes).ok()?;
        let s = str::from_utf8(str_bytes).ok()?;
        Some((n, s))
    }
}
impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
    type EItem = (u32, &'a str);

    /// Serializes the integer as 4 big-endian bytes followed by the raw
    /// bytes of the string. Always returns `Some`.
    fn bytes_encode((n, s): &Self::EItem) -> Option<Cow<[u8]>> {
        let prefix = n.to_be_bytes();
        let mut buffer = Vec::with_capacity(s.len() + prefix.len());
        buffer.extend_from_slice(&prefix);
        buffer.extend_from_slice(s.as_bytes());
        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,29 @@
use std::borrow::Cow;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
/// Codec storing a `RoaringBitmap` as a flat sequence of native-endian `u32`s.
pub struct BoRoaringBitmapCodec;

impl heed::BytesDecode<'_> for BoRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    /// Reads native-endian `u32`s until a read fails and inserts each of
    /// them into a fresh bitmap.
    ///
    /// Decoding stops at the first failed read, so trailing bytes that do
    /// not form a full `u32` are ignored. This always returns `Some`.
    fn bytes_decode(mut bytes: &[u8]) -> Option<Self::DItem> {
        let mut bitmap = RoaringBitmap::new();
        loop {
            match bytes.read_u32::<NativeEndian>() {
                Ok(value) => {
                    bitmap.insert(value);
                }
                Err(_) => break,
            }
        }
        Some(bitmap)
    }
}
impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
    type EItem = RoaringBitmap;

    /// Writes every integer of the bitmap, in iteration order, as a
    /// native-endian `u32`. Returns `None` if a write fails.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        // Four bytes are written per stored integer.
        let capacity = item.len() as usize * 4;
        let mut buffer: Vec<u8> = Vec::with_capacity(capacity);
        item.iter()
            .try_for_each(|value| buffer.write_u32::<NativeEndian>(value))
            .ok()?;
        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,108 @@
use std::borrow::Cow;
use std::io;
use std::mem::size_of;
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
use roaring::RoaringBitmap;
/// Maximum number of integers stored with the plain ByteOrder encoding.
///
/// This is the limit beyond which the ByteOrder encoding becomes less
/// size-efficient than the direct roaring encoding. It is also the point
/// that lets us determine which encoding was used solely from the length
/// of the byte array.
const THRESHOLD: usize = 7;
/// A conditional codec that uses either the RoaringBitmap encoding
/// or a lighter ByteOrder en/decoding method.
pub struct CboRoaringBitmapCodec;

impl CboRoaringBitmapCodec {
    /// Returns the size in bytes of the encoding selected for `roaring`:
    /// `len * size_of::<u32>()` when the bitmap holds at most `THRESHOLD`
    /// integers, `RoaringBitmap::serialized_size` otherwise.
    pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
        let count = roaring.len();
        if count > THRESHOLD as u64 {
            roaring.serialized_size()
        } else {
            count as usize * size_of::<u32>()
        }
    }

    /// Appends the encoded form of `roaring` to `vec`.
    pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) -> io::Result<()> {
        if roaring.len() > THRESHOLD as u64 {
            // Above the threshold we use the classic RoaringBitmap
            // serialization, which writes a header.
            roaring.serialize_into(vec)
        } else {
            // With threshold-or-fewer integers the raw u32s weigh the same or
            // less than the RoaringBitmap header, so write them directly.
            roaring
                .iter()
                .try_for_each(|integer| vec.write_u32::<NativeEndian>(integer))
        }
    }

    /// Rebuilds a bitmap from `bytes`, picking the decoder from the input length.
    pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
        if bytes.len() > THRESHOLD * size_of::<u32>() {
            // More bytes than the ByteOrder encoding can ever produce: this
            // is the classic RoaringBitmap serialization.
            return RoaringBitmap::deserialize_from(bytes);
        }

        // Short enough to hold at most `THRESHOLD` integers: the ByteOrder
        // serializer was used. Read u32s until a read fails.
        let mut bitmap = RoaringBitmap::new();
        loop {
            match bytes.read_u32::<NativeEndian>() {
                Ok(integer) => {
                    bitmap.insert(integer);
                }
                Err(_) => break,
            }
        }
        Ok(bitmap)
    }
}
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
    type DItem = RoaringBitmap;

    /// Delegates to `deserialize_from`, turning any I/O error into `None`.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        match Self::deserialize_from(bytes) {
            Ok(bitmap) => Some(bitmap),
            Err(_) => None,
        }
    }
}
impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
    type EItem = RoaringBitmap;

    /// Serializes `item` into a buffer pre-sized with `serialized_size`.
    /// Returns `None` if serialization reports an I/O error.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        let capacity = Self::serialized_size(item);
        let mut buffer = Vec::with_capacity(capacity);
        match Self::serialize_into(item, &mut buffer) {
            Ok(()) => Some(Cow::Owned(buffer)),
            Err(_) => None,
        }
    }
}
#[cfg(test)]
mod tests {
    use std::iter::FromIterator;

    use heed::{BytesDecode, BytesEncode};

    use super::*;

    /// A bitmap holding exactly `THRESHOLD` integers must survive an
    /// encode/decode round trip.
    #[test]
    fn verify_encoding_decoding() {
        let original = RoaringBitmap::from_iter(0..THRESHOLD as u32);
        let encoded = CboRoaringBitmapCodec::bytes_encode(&original).unwrap();
        let decoded = CboRoaringBitmapCodec::bytes_decode(&encoded).unwrap();
        assert_eq!(original, decoded);
    }

    /// With `THRESHOLD` integers the roaring serialization must be strictly
    /// larger than the raw ByteOrder one.
    #[test]
    fn verify_threshold() {
        let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);

        // Size when using the roaring bitmap serialization.
        let mut roaring_bytes: Vec<u8> = Vec::new();
        input.serialize_into(&mut roaring_bytes).unwrap();
        let roaring_size = roaring_bytes.len();

        // Size when writing the integers directly with byteorder.
        let mut bo_bytes: Vec<u8> = Vec::new();
        for integer in input {
            bo_bytes.write_u32::<NativeEndian>(integer).unwrap();
        }
        let bo_size = bo_bytes.len();

        assert!(roaring_size > bo_size);
    }
}

View file

@ -0,0 +1,87 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::f64_into_bytes;
use crate::FieldId;
// TODO do not de/serialize right bound when level = 0
/// Codec for `(field id, level, left bound, right bound)` facet keys holding `f64` bounds.
pub struct FacetLevelValueF64Codec;

impl<'a> heed::BytesDecode<'a> for FacetLevelValueF64Codec {
    type DItem = (FieldId, u8, f64, f64);

    /// Decodes the field id, the level and the bounds.
    ///
    /// After the two leading bytes (field id, level) the layout is:
    /// - level != 0: 16 bytes that are skipped here, then the left and right
    ///   bounds as big-endian `f64`s (8 bytes each);
    /// - level == 0: 8 bytes that are skipped here, then the left bound as a
    ///   big-endian `f64`, which is also returned as the right bound.
    ///
    /// Returns `None` when the input is too short or when the final segment
    /// is not exactly 8 bytes long.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (field_id, bytes) = bytes.split_first()?;
        let (level, bytes) = bytes.split_first()?;

        // Use checked slicing: plain range indexing panics on truncated input.
        let (left, right) = if *level != 0 {
            let left = bytes.get(16..24)?.try_into().ok().map(f64::from_be_bytes)?;
            let right = bytes.get(24..)?.try_into().ok().map(f64::from_be_bytes)?;
            (left, right)
        } else {
            let left = bytes.get(8..)?.try_into().ok().map(f64::from_be_bytes)?;
            (left, left)
        };

        Some((*field_id, *level, left, right))
    }
}
impl heed::BytesEncode<'_> for FacetLevelValueF64Codec {
    type EItem = (FieldId, u8, f64, f64);

    /// Encodes the field id and level, then the globally ordered form of the
    /// bound(s), then the plain big-endian `f64` bound(s) so they can be read
    /// back. The right bound is only written when `level != 0`.
    ///
    /// Returns `None` when `f64_into_bytes` rejects one of the written bounds.
    fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
        let has_right_bound = *level != 0;
        let payload_len = if has_right_bound { 32 } else { 16 };

        let mut bytes = Vec::with_capacity(payload_len + 2);
        bytes.push(*field_id);
        bytes.push(*level);

        // Write the globally ordered floats.
        let ordered_left = f64_into_bytes(*left)?;
        bytes.extend_from_slice(&ordered_left[..]);
        if has_right_bound {
            let ordered_right = f64_into_bytes(*right)?;
            bytes.extend_from_slice(&ordered_right[..]);
        }

        // Then the f64 values just to be able to read them back.
        bytes.extend_from_slice(&left.to_be_bytes()[..]);
        if has_right_bound {
            bytes.extend_from_slice(&right.to_be_bytes()[..]);
        }

        Some(Cow::Owned(bytes))
    }
}
#[cfg(test)]
mod tests {
    use heed::{BytesDecode, BytesEncode};

    use super::*;

    #[test]
    fn globally_ordered_f64() {
        // Level 0: the right bound is not stored, so it decodes as the left bound.
        let encoded = FacetLevelValueF64Codec::bytes_encode(&(3, 0, 32.0, 0.0)).unwrap();
        let decoded = FacetLevelValueF64Codec::bytes_decode(&encoded).unwrap();
        assert_eq!(decoded, (3, 0, 32.0, 32.0));

        // Level 1: both bounds are stored and read back unchanged.
        let encoded = FacetLevelValueF64Codec::bytes_encode(&(3, 1, -32.0, 32.0)).unwrap();
        let decoded = FacetLevelValueF64Codec::bytes_decode(&encoded).unwrap();
        assert_eq!(decoded, (3, 1, -32.0, 32.0));
    }
}

View file

@ -0,0 +1,44 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::{i64_from_bytes, i64_into_bytes};
use crate::FieldId;
/// Codec for `(field id, level, left bound, right bound)` facet keys holding `i64` bounds.
pub struct FacetLevelValueI64Codec;

impl<'a> heed::BytesDecode<'a> for FacetLevelValueI64Codec {
    type DItem = (FieldId, u8, i64, i64);

    /// Decodes the field id, the level, the left bound (8 bytes, via
    /// `i64_from_bytes`) and, when `level != 0`, the right bound from the
    /// remaining bytes. At level 0 the left bound is returned as the right bound.
    ///
    /// Returns `None` when the input is too short or when the right-bound
    /// segment is not exactly 8 bytes long.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (field_id, bytes) = bytes.split_first()?;
        let (level, bytes) = bytes.split_first()?;

        // Use checked slicing: `bytes[..8]` panics on truncated input.
        let left = bytes.get(..8)?.try_into().map(i64_from_bytes).ok()?;
        let right = if *level != 0 {
            bytes.get(8..)?.try_into().map(i64_from_bytes).ok()?
        } else {
            left
        };

        Some((*field_id, *level, left, right))
    }
}
impl heed::BytesEncode<'_> for FacetLevelValueI64Codec {
    type EItem = (FieldId, u8, i64, i64);

    /// Encodes the field id, the level and the left bound, followed by the
    /// right bound only when `level != 0`. Always returns `Some`.
    fn bytes_encode((field_id, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
        let left_bytes = i64_into_bytes(*left);
        let right_bytes = i64_into_bytes(*right);

        // Room is reserved for both bounds even if only one ends up written.
        let capacity = 2 + left_bytes.len() + right_bytes.len();
        let mut buffer = Vec::with_capacity(capacity);

        buffer.extend_from_slice(&[*field_id, *level]);
        buffer.extend_from_slice(&left_bytes[..]);
        if *level != 0 {
            buffer.extend_from_slice(&right_bytes[..]);
        }

        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,27 @@
use std::borrow::Cow;
use std::str;
use crate::FieldId;
/// Codec for `(field id, string value)` facet keys.
pub struct FacetValueStringCodec;

impl<'a> heed::BytesDecode<'a> for FacetValueStringCodec {
    type DItem = (FieldId, &'a str);

    /// Splits off the leading field-id byte and interprets the rest as UTF-8.
    /// Returns `None` on empty input or invalid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (&field_id, rest) = bytes.split_first()?;
        match str::from_utf8(rest) {
            Ok(value) => Some((field_id, value)),
            Err(_) => None,
        }
    }
}
impl<'a> heed::BytesEncode<'a> for FacetValueStringCodec {
    type EItem = (FieldId, &'a str);

    /// Writes the field id followed by the raw bytes of the string.
    /// Always returns `Some`.
    fn bytes_encode((field_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let payload = value.as_bytes();
        let mut buffer = Vec::with_capacity(payload.len() + 1);
        buffer.push(*field_id);
        buffer.extend_from_slice(payload);
        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,36 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::{FieldId, DocumentId};
use crate::facet::value_encoding::f64_into_bytes;
/// Codec for `(field id, document id, f64 value)` keys.
pub struct FieldDocIdFacetF64Codec;

impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetF64Codec {
    type DItem = (FieldId, DocumentId, f64);

    /// Decodes the field id (1 byte), the big-endian document id (4 bytes),
    /// skips the next 8 bytes and reads the value as a big-endian `f64`
    /// from the 8 bytes that follow.
    ///
    /// Returns `None` when the input is too short.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (field_id, bytes) = bytes.split_first()?;

        // `split_at` panics on short input, so guard the length first.
        if bytes.len() < 4 {
            return None;
        }
        let (document_id_bytes, bytes) = bytes.split_at(4);
        let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;

        // Checked slicing: `bytes[8..16]` panics on truncated input.
        let value = bytes.get(8..16)?.try_into().map(f64::from_be_bytes).ok()?;

        Some((*field_id, document_id, value))
    }
}
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetF64Codec {
    type EItem = (FieldId, DocumentId, f64);

    /// Writes the field id, the big-endian document id, the output of
    /// `f64_into_bytes` for the value, and finally the plain big-endian `f64`.
    ///
    /// Returns `None` when `f64_into_bytes` rejects the value.
    fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let ordered_value = f64_into_bytes(*value)?;

        let mut buffer = Vec::with_capacity(1 + 4 + 8 + 8);
        buffer.push(*field_id);
        buffer.extend_from_slice(&document_id.to_be_bytes());
        buffer.extend_from_slice(&ordered_value);
        buffer.extend_from_slice(&value.to_be_bytes());

        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,34 @@
use std::borrow::Cow;
use std::convert::TryInto;
use crate::facet::value_encoding::{i64_into_bytes, i64_from_bytes};
use crate::{FieldId, DocumentId};
/// Codec for `(field id, document id, i64 value)` keys.
pub struct FieldDocIdFacetI64Codec;

impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetI64Codec {
    type DItem = (FieldId, DocumentId, i64);

    /// Decodes the field id (1 byte), the big-endian document id (4 bytes)
    /// and the value from the next 8 bytes via `i64_from_bytes`.
    ///
    /// Returns `None` when the input is too short.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (field_id, bytes) = bytes.split_first()?;

        // `split_at` panics on short input, so guard the length first.
        if bytes.len() < 4 {
            return None;
        }
        let (document_id_bytes, bytes) = bytes.split_at(4);
        let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;

        // Checked slicing: `bytes[..8]` panics on truncated input.
        let value = bytes.get(..8)?.try_into().map(i64_from_bytes).ok()?;

        Some((*field_id, document_id, value))
    }
}
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetI64Codec {
    type EItem = (FieldId, DocumentId, i64);

    /// Writes the field id, the big-endian document id and the output of
    /// `i64_into_bytes` for the value. Always returns `Some`.
    fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let mut buffer = Vec::with_capacity(1 + 4 + 8);
        buffer.push(*field_id);

        let document_id_bytes = document_id.to_be_bytes();
        buffer.extend_from_slice(&document_id_bytes);

        let value_bytes = i64_into_bytes(*value);
        buffer.extend_from_slice(&value_bytes);

        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,31 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::str;
use crate::{FieldId, DocumentId};
/// Codec for `(field id, document id, string value)` keys.
pub struct FieldDocIdFacetStringCodec;

impl<'a> heed::BytesDecode<'a> for FieldDocIdFacetStringCodec {
    type DItem = (FieldId, DocumentId, &'a str);

    /// Decodes the field id (1 byte), the big-endian document id (4 bytes)
    /// and interprets the remaining bytes as a UTF-8 string.
    ///
    /// Returns `None` when the input is too short or the string is not
    /// valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (field_id, bytes) = bytes.split_first()?;

        // `split_at` panics on short input, so guard the length first.
        if bytes.len() < 4 {
            return None;
        }
        let (document_id_bytes, bytes) = bytes.split_at(4);
        let document_id = document_id_bytes.try_into().map(u32::from_be_bytes).ok()?;

        let value = str::from_utf8(bytes).ok()?;
        Some((*field_id, document_id, value))
    }
}
impl<'a> heed::BytesEncode<'a> for FieldDocIdFacetStringCodec {
    type EItem = (FieldId, DocumentId, &'a str);

    /// Writes the field id, the big-endian document id and the raw bytes of
    /// the string. Always returns `Some`.
    fn bytes_encode((field_id, document_id, value): &Self::EItem) -> Option<Cow<[u8]>> {
        let payload = value.as_bytes();
        let mut buffer = Vec::with_capacity(1 + 4 + payload.len());
        buffer.push(*field_id);
        buffer.extend_from_slice(&document_id.to_be_bytes());
        buffer.extend_from_slice(payload);
        Some(Cow::Owned(buffer))
    }
}

View file

@ -0,0 +1,13 @@
mod facet_level_value_f64_codec;
mod facet_level_value_i64_codec;
mod facet_value_string_codec;
mod field_doc_id_facet_f64_codec;
mod field_doc_id_facet_i64_codec;
mod field_doc_id_facet_string_codec;
pub use self::facet_level_value_f64_codec::FacetLevelValueF64Codec;
pub use self::facet_level_value_i64_codec::FacetLevelValueI64Codec;
pub use self::facet_value_string_codec::FacetValueStringCodec;
pub use self::field_doc_id_facet_f64_codec::FieldDocIdFacetF64Codec;
pub use self::field_doc_id_facet_i64_codec::FieldDocIdFacetI64Codec;
pub use self::field_doc_id_facet_string_codec::FieldDocIdFacetStringCodec;

View file

@ -0,0 +1,14 @@
mod beu32_str_codec;
mod bo_roaring_bitmap_codec;
mod cbo_roaring_bitmap_codec;
mod obkv_codec;
mod roaring_bitmap_codec;
mod str_str_u8_codec;
pub mod facet;
pub use self::beu32_str_codec::BEU32StrCodec;
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
pub use self::obkv_codec::ObkvCodec;
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View file

@ -0,0 +1,20 @@
use std::borrow::Cow;
use obkv::{KvReader, KvWriter};
/// Codec exposing stored bytes as an obkv `KvReader` and storing the buffer
/// of a `KvWriter<Vec<u8>>`.
pub struct ObkvCodec;

impl<'a> heed::BytesDecode<'a> for ObkvCodec {
    type DItem = KvReader<'a>;

    /// Wraps the bytes in a `KvReader`. Always returns `Some`.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let reader = KvReader::new(bytes);
        Some(reader)
    }
}
impl heed::BytesEncode<'_> for ObkvCodec {
    type EItem = KvWriter<Vec<u8>>;

    /// Clones the writer, extracts its inner buffer with `into_inner` and
    /// returns it as owned bytes. Returns `None` if `into_inner` fails.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        // Only a shared reference is available, so `into_inner` is called on a clone.
        let writer = item.clone();
        match writer.into_inner() {
            Ok(buffer) => Some(Cow::Owned(buffer)),
            Err(_) => None,
        }
    }
}

View file

@ -0,0 +1,22 @@
use std::borrow::Cow;
use roaring::RoaringBitmap;
/// Codec relying on `RoaringBitmap`'s own serialization.
pub struct RoaringBitmapCodec;

impl heed::BytesDecode<'_> for RoaringBitmapCodec {
    type DItem = RoaringBitmap;

    /// Deserializes a bitmap from `bytes`, returning `None` on error.
    fn bytes_decode(bytes: &[u8]) -> Option<Self::DItem> {
        match RoaringBitmap::deserialize_from(bytes) {
            Ok(bitmap) => Some(bitmap),
            Err(_) => None,
        }
    }
}
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
    type EItem = RoaringBitmap;

    /// Serializes the bitmap into a buffer pre-sized with `serialized_size`.
    /// Returns `None` if serialization fails.
    fn bytes_encode(item: &Self::EItem) -> Option<Cow<[u8]>> {
        let capacity = item.serialized_size();
        let mut buffer = Vec::with_capacity(capacity);
        match item.serialize_into(&mut buffer) {
            Ok(()) => Some(Cow::Owned(buffer)),
            Err(_) => None,
        }
    }
}

View file

@ -0,0 +1,30 @@
use std::borrow::Cow;
use std::str;
/// Codec for two strings separated by a zero byte and followed by a single `u8`.
pub struct StrStrU8Codec;

impl<'a> heed::BytesDecode<'a> for StrStrU8Codec {
    type DItem = (&'a str, &'a str, u8);

    /// Takes the last byte as the `u8`, then splits what remains at the
    /// first zero byte into two UTF-8 strings.
    ///
    /// Returns `None` on empty input, when no zero byte is found, or when
    /// either part is not valid UTF-8.
    fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
        let (&n, body) = bytes.split_last()?;
        let separator = body.iter().position(|&byte| byte == 0)?;

        let s1 = str::from_utf8(&body[..separator]).ok()?;
        // Skip the separator itself; `separator < body.len()` so this cannot go out of bounds.
        let s2 = str::from_utf8(&body[separator + 1..]).ok()?;

        Some((s1, s2, n))
    }
}
impl<'a> heed::BytesEncode<'a> for StrStrU8Codec {
    type EItem = (&'a str, &'a str, u8);

    /// Writes the first string, a zero byte, the second string and finally
    /// the `u8`. Always returns `Some`.
    fn bytes_encode((s1, s2, n): &Self::EItem) -> Option<Cow<[u8]>> {
        // One extra byte for the separator and one for the trailing `u8`.
        let capacity = s1.len() + s2.len() + 1 + 1;
        let mut buffer = Vec::with_capacity(capacity);

        buffer.extend_from_slice(s1.as_bytes());
        buffer.push(0);
        buffer.extend_from_slice(s2.as_bytes());
        buffer.push(*n);

        Some(Cow::Owned(buffer))
    }
}