mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Merge branch 'main' into indexer-edition-2024
This commit is contained in:
commit
10feeb88f2
1122 changed files with 6265 additions and 5265 deletions
29
crates/milli/src/heed_codec/beu16_str_codec.rs
Normal file
29
crates/milli/src/heed_codec/beu16_str_codec.rs
Normal file
|
@ -0,0 +1,29 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
pub struct BEU16StrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for BEU16StrCodec {
|
||||
type DItem = (u16, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n_bytes, str_bytes) = bytes.split_at(2);
|
||||
let n = n_bytes.try_into().map(u16::from_be_bytes)?;
|
||||
let s = str::from_utf8(str_bytes)?;
|
||||
Ok((n, s))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for BEU16StrCodec {
|
||||
type EItem = (u16, &'a str);
|
||||
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s.len() + 2);
|
||||
bytes.extend_from_slice(&n.to_be_bytes());
|
||||
bytes.extend_from_slice(s.as_bytes());
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
29
crates/milli/src/heed_codec/beu32_str_codec.rs
Normal file
29
crates/milli/src/heed_codec/beu32_str_codec.rs
Normal file
|
@ -0,0 +1,29 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
pub struct BEU32StrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for BEU32StrCodec {
|
||||
type DItem = (u32, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n_bytes, str_bytes) = bytes.split_at(4);
|
||||
let n = n_bytes.try_into().map(u32::from_be_bytes)?;
|
||||
let s = str::from_utf8(str_bytes)?;
|
||||
Ok((n, s))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for BEU32StrCodec {
|
||||
type EItem = (u32, &'a str);
|
||||
|
||||
fn bytes_encode((n, s): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s.len() + 4);
|
||||
bytes.extend_from_slice(&n.to_be_bytes());
|
||||
bytes.extend_from_slice(s.as_bytes());
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
23
crates/milli/src/heed_codec/byte_slice_ref.rs
Normal file
23
crates/milli/src/heed_codec/byte_slice_ref.rs
Normal file
|
@ -0,0 +1,23 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `&[u8]`. Unlike `Bytes`, its `EItem` and `DItem` associated
|
||||
/// types are equivalent (= `&'a [u8]`) and these values can reside within another structure.
|
||||
pub struct BytesRefCodec;
|
||||
|
||||
impl<'a> BytesEncode<'a> for BytesRefCodec {
|
||||
type EItem = &'a [u8];
|
||||
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BytesDecode<'a> for BytesRefCodec {
|
||||
type DItem = &'a [u8];
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(bytes)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,47 @@
|
|||
use std::borrow::Cow;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
use crate::heed_codec::SliceTooShortError;
|
||||
use crate::{try_split_array_at, DocumentId, FieldId};
|
||||
|
||||
pub struct FieldDocIdFacetCodec<C>(PhantomData<C>);
|
||||
|
||||
impl<'a, C> BytesDecode<'a> for FieldDocIdFacetCodec<C>
|
||||
where
|
||||
C: BytesDecode<'a>,
|
||||
{
|
||||
type DItem = (FieldId, DocumentId, C::DItem);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
|
||||
let (document_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let document_id = u32::from_be_bytes(document_id_bytes);
|
||||
|
||||
let value = C::bytes_decode(bytes)?;
|
||||
|
||||
Ok((field_id, document_id, value))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, C> BytesEncode<'a> for FieldDocIdFacetCodec<C>
|
||||
where
|
||||
C: BytesEncode<'a>,
|
||||
{
|
||||
type EItem = (FieldId, DocumentId, C::EItem);
|
||||
|
||||
fn bytes_encode(
|
||||
(field_id, document_id, value): &'a Self::EItem,
|
||||
) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(32);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes()); // 2 bytes
|
||||
bytes.extend_from_slice(&document_id.to_be_bytes()); // 4 bytes
|
||||
let value_bytes = C::bytes_encode(value)?;
|
||||
// variable length, if f64 -> 16 bytes, if string -> large, potentially
|
||||
bytes.extend_from_slice(&value_bytes);
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
123
crates/milli/src/heed_codec/facet/mod.rs
Normal file
123
crates/milli/src/heed_codec/facet/mod.rs
Normal file
|
@ -0,0 +1,123 @@
|
|||
mod field_doc_id_facet_codec;
|
||||
mod ordered_f64_codec;
|
||||
|
||||
use std::borrow::Cow;
|
||||
use std::convert::TryFrom;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use heed::types::DecodeIgnore;
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
pub use self::field_doc_id_facet_codec::FieldDocIdFacetCodec;
|
||||
pub use self::ordered_f64_codec::OrderedF64Codec;
|
||||
use super::StrRefCodec;
|
||||
use crate::{CboRoaringBitmapCodec, BEU16};
|
||||
|
||||
pub type FieldDocIdFacetF64Codec = FieldDocIdFacetCodec<OrderedF64Codec>;
|
||||
pub type FieldDocIdFacetStringCodec = FieldDocIdFacetCodec<StrRefCodec>;
|
||||
pub type FieldDocIdFacetIgnoreCodec = FieldDocIdFacetCodec<DecodeIgnore>;
|
||||
|
||||
pub type FieldIdCodec = BEU16;
|
||||
|
||||
/// Tries to split a slice in half at the given middle point,
|
||||
/// `None` if the slice is too short.
|
||||
pub fn try_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> {
|
||||
if slice.len() >= mid {
|
||||
Some(slice.split_at(mid))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// The key in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
|
||||
/// databases.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] // TODO: try removing PartialOrd and Ord
|
||||
pub struct FacetGroupKey<T> {
|
||||
pub field_id: u16,
|
||||
pub level: u8,
|
||||
pub left_bound: T,
|
||||
}
|
||||
|
||||
/// The value in the [`facet_id_string_docids` and `facet_id_f64_docids`][`Index::facet_id_string_docids`]
|
||||
/// databases.
|
||||
#[derive(Debug)]
|
||||
pub struct FacetGroupValue {
|
||||
pub size: u8,
|
||||
pub bitmap: RoaringBitmap,
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct FacetGroupLazyValue<'b> {
|
||||
pub size: u8,
|
||||
pub bitmap_bytes: &'b [u8],
|
||||
}
|
||||
|
||||
pub struct FacetGroupKeyCodec<T> {
|
||||
_phantom: PhantomData<T>,
|
||||
}
|
||||
|
||||
impl<'a, T> heed::BytesEncode<'a> for FacetGroupKeyCodec<T>
|
||||
where
|
||||
T: BytesEncode<'a>,
|
||||
T::EItem: Sized,
|
||||
{
|
||||
type EItem = FacetGroupKey<T::EItem>;
|
||||
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut v = vec![];
|
||||
v.extend_from_slice(&value.field_id.to_be_bytes());
|
||||
v.extend_from_slice(&[value.level]);
|
||||
|
||||
let bound = T::bytes_encode(&value.left_bound)?;
|
||||
v.extend_from_slice(&bound);
|
||||
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, T> heed::BytesDecode<'a> for FacetGroupKeyCodec<T>
|
||||
where
|
||||
T: BytesDecode<'a>,
|
||||
{
|
||||
type DItem = FacetGroupKey<T::DItem>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let fid = u16::from_be_bytes(<[u8; 2]>::try_from(&bytes[0..=1])?);
|
||||
let level = bytes[2];
|
||||
let bound = T::bytes_decode(&bytes[3..])?;
|
||||
Ok(FacetGroupKey { field_id: fid, level, left_bound: bound })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetGroupValueCodec;
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FacetGroupValueCodec {
|
||||
type EItem = FacetGroupValue;
|
||||
|
||||
fn bytes_encode(value: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut v = vec![value.size];
|
||||
CboRoaringBitmapCodec::serialize_into(&value.bitmap, &mut v);
|
||||
Ok(Cow::Owned(v))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetGroupValueCodec {
|
||||
type DItem = FacetGroupValue;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let size = bytes[0];
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&bytes[1..])?;
|
||||
Ok(FacetGroupValue { size, bitmap })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FacetGroupLazyValueCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FacetGroupLazyValueCodec {
|
||||
type DItem = FacetGroupLazyValue<'a>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(FacetGroupLazyValue { size: bytes[0], bitmap_bytes: &bytes[1..] })
|
||||
}
|
||||
}
|
45
crates/milli/src/heed_codec/facet/ordered_f64_codec.rs
Normal file
45
crates/milli/src/heed_codec/facet/ordered_f64_codec.rs
Normal file
|
@ -0,0 +1,45 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
use thiserror::Error;
|
||||
|
||||
use crate::facet::value_encoding::f64_into_bytes;
|
||||
use crate::heed_codec::SliceTooShortError;
|
||||
|
||||
pub struct OrderedF64Codec;
|
||||
|
||||
impl<'a> BytesDecode<'a> for OrderedF64Codec {
|
||||
type DItem = f64;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
if bytes.len() < 16 {
|
||||
Err(SliceTooShortError.into())
|
||||
} else {
|
||||
bytes[8..].try_into().map(f64::from_be_bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for OrderedF64Codec {
|
||||
type EItem = f64;
|
||||
|
||||
fn bytes_encode(f: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||
let mut buffer = [0u8; 16];
|
||||
|
||||
// write the globally ordered float
|
||||
let bytes = f64_into_bytes(*f).ok_or(InvalidGloballyOrderedFloatError { float: *f })?;
|
||||
buffer[..8].copy_from_slice(&bytes[..]);
|
||||
// Then the f64 value just to be able to read it back
|
||||
let bytes = f.to_be_bytes();
|
||||
buffer[8..16].copy_from_slice(&bytes[..]);
|
||||
|
||||
Ok(Cow::Owned(buffer.to_vec()))
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("the float {float} cannot be converted to a globally ordered representation")]
|
||||
pub struct InvalidGloballyOrderedFloatError {
|
||||
float: f64,
|
||||
}
|
30
crates/milli/src/heed_codec/field_id_word_count_codec.rs
Normal file
30
crates/milli/src/heed_codec/field_id_word_count_codec.rs
Normal file
|
@ -0,0 +1,30 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
use crate::{try_split_array_at, FieldId};
|
||||
|
||||
pub struct FieldIdWordCountCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for FieldIdWordCountCodec {
|
||||
type DItem = (FieldId, u8);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (field_id_bytes, bytes) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
let field_id = u16::from_be_bytes(field_id_bytes);
|
||||
let ([word_count], _nothing) = try_split_array_at(bytes).ok_or(SliceTooShortError)?;
|
||||
Ok((field_id, word_count))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for FieldIdWordCountCodec {
|
||||
type EItem = (FieldId, u8);
|
||||
|
||||
fn bytes_encode((field_id, word_count): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(2 + 1);
|
||||
bytes.extend_from_slice(&field_id.to_be_bytes());
|
||||
bytes.push(*word_count);
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
23
crates/milli/src/heed_codec/fst_set_codec.rs
Normal file
23
crates/milli/src/heed_codec/fst_set_codec.rs
Normal file
|
@ -0,0 +1,23 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use fst::Set;
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `Set<&[u8]>`.
|
||||
pub struct FstSetCodec;
|
||||
|
||||
impl<'a> BytesEncode<'a> for FstSetCodec {
|
||||
type EItem = Set<Vec<u8>>;
|
||||
|
||||
fn bytes_encode(item: &'a Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_fst().as_bytes()))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> BytesDecode<'a> for FstSetCodec {
|
||||
type DItem = Set<&'a [u8]>;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Set::new(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
39
crates/milli/src/heed_codec/mod.rs
Normal file
39
crates/milli/src/heed_codec/mod.rs
Normal file
|
@ -0,0 +1,39 @@
|
|||
mod beu16_str_codec;
|
||||
mod beu32_str_codec;
|
||||
mod byte_slice_ref;
|
||||
pub mod facet;
|
||||
mod field_id_word_count_codec;
|
||||
mod fst_set_codec;
|
||||
mod obkv_codec;
|
||||
mod roaring_bitmap;
|
||||
mod roaring_bitmap_length;
|
||||
mod str_beu32_codec;
|
||||
mod str_ref;
|
||||
mod str_str_u8_codec;
|
||||
|
||||
pub use byte_slice_ref::BytesRefCodec;
|
||||
use heed::BoxedError;
|
||||
pub use str_ref::StrRefCodec;
|
||||
use thiserror::Error;
|
||||
|
||||
pub use self::beu16_str_codec::BEU16StrCodec;
|
||||
pub use self::beu32_str_codec::BEU32StrCodec;
|
||||
pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
|
||||
pub use self::fst_set_codec::FstSetCodec;
|
||||
pub use self::obkv_codec::ObkvCodec;
|
||||
pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, RoaringBitmapCodec};
|
||||
pub use self::roaring_bitmap_length::{
|
||||
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
|
||||
};
|
||||
pub use self::str_beu32_codec::{StrBEU16Codec, StrBEU32Codec};
|
||||
pub use self::str_str_u8_codec::{U8StrStrCodec, UncheckedU8StrStrCodec};
|
||||
|
||||
pub trait BytesDecodeOwned {
|
||||
type DItem;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError>;
|
||||
}
|
||||
|
||||
#[derive(Error, Debug)]
|
||||
#[error("the slice is too short")]
|
||||
pub struct SliceTooShortError;
|
22
crates/milli/src/heed_codec/obkv_codec.rs
Normal file
22
crates/milli/src/heed_codec/obkv_codec.rs
Normal file
|
@ -0,0 +1,22 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
use obkv::{KvReaderU16, KvWriterU16};
|
||||
|
||||
pub struct ObkvCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for ObkvCodec {
|
||||
type DItem = &'a KvReaderU16;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok(KvReaderU16::from_slice(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for ObkvCodec {
|
||||
type EItem = KvWriterU16<Vec<u8>>;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||
item.clone().into_inner().map(Cow::Owned).map_err(Into::into)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,50 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::mem::size_of;
|
||||
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct BoRoaringBitmapCodec;
|
||||
|
||||
impl BoRoaringBitmapCodec {
|
||||
pub fn serialize_into(bitmap: &RoaringBitmap, out: &mut Vec<u8>) {
|
||||
out.reserve(bitmap.len() as usize * size_of::<u32>());
|
||||
bitmap.iter().map(u32::to_ne_bytes).for_each(|bytes| out.extend_from_slice(&bytes));
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecode<'_> for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
|
||||
for chunk in bytes.chunks(size_of::<u32>()) {
|
||||
let bytes = chunk.try_into()?;
|
||||
bitmap.push(u32::from_ne_bytes(bytes));
|
||||
}
|
||||
|
||||
Ok(bitmap)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for BoRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||
let mut out = Vec::new();
|
||||
BoRoaringBitmapCodec::serialize_into(item, &mut out);
|
||||
Ok(Cow::Owned(out))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,247 @@
|
|||
use std::borrow::Cow;
|
||||
use std::io::{self, Cursor};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use heed::BoxedError;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
use crate::update::del_add::{DelAdd, KvReaderDelAdd};
|
||||
|
||||
/// This is the limit where using a byteorder became less size efficient
|
||||
/// than using a direct roaring encoding, it is also the point where we are able
|
||||
/// to determine the encoding used only by using the array of bytes length.
|
||||
pub const THRESHOLD: usize = 7;
|
||||
|
||||
/// A conditionnal codec that either use the RoaringBitmap
|
||||
/// or a lighter ByteOrder en/decoding method.
|
||||
pub struct CboRoaringBitmapCodec;
|
||||
|
||||
impl CboRoaringBitmapCodec {
|
||||
pub fn serialized_size(roaring: &RoaringBitmap) -> usize {
|
||||
if roaring.len() <= THRESHOLD as u64 {
|
||||
roaring.len() as usize * size_of::<u32>()
|
||||
} else {
|
||||
roaring.serialized_size()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn serialize_into(roaring: &RoaringBitmap, vec: &mut Vec<u8>) {
|
||||
if roaring.len() <= THRESHOLD as u64 {
|
||||
// If the number of items (u32s) to encode is less than or equal to the threshold
|
||||
// it means that it would weigh the same or less than the RoaringBitmap
|
||||
// header, so we directly encode them using ByteOrder instead.
|
||||
for integer in roaring {
|
||||
vec.write_u32::<NativeEndian>(integer).unwrap();
|
||||
}
|
||||
} else {
|
||||
// Otherwise, we use the classic RoaringBitmapCodec that writes a header.
|
||||
roaring.serialize_into(vec).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn deserialize_from(mut bytes: &[u8]) -> io::Result<RoaringBitmap> {
|
||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
||||
// If there is threshold or less than threshold integers that can fit into this array
|
||||
// of bytes it means that we used the ByteOrder codec serializer.
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
||||
bitmap.insert(integer);
|
||||
}
|
||||
Ok(bitmap)
|
||||
} else {
|
||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||
// that the header takes threshold integers.
|
||||
RoaringBitmap::deserialize_unchecked_from(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
pub fn intersection_with_serialized(
|
||||
mut bytes: &[u8],
|
||||
other: &RoaringBitmap,
|
||||
) -> io::Result<RoaringBitmap> {
|
||||
// See above `deserialize_from` method for implementation details.
|
||||
if bytes.len() <= THRESHOLD * size_of::<u32>() {
|
||||
let mut bitmap = RoaringBitmap::new();
|
||||
while let Ok(integer) = bytes.read_u32::<NativeEndian>() {
|
||||
if other.contains(integer) {
|
||||
bitmap.insert(integer);
|
||||
}
|
||||
}
|
||||
Ok(bitmap)
|
||||
} else {
|
||||
other.intersection_with_serialized_unchecked(Cursor::new(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
/// Merge serialized CboRoaringBitmaps in a buffer.
|
||||
///
|
||||
/// if the merged values length is under the threshold, values are directly
|
||||
/// serialized in the buffer else a RoaringBitmap is created from the
|
||||
/// values and is serialized in the buffer.
|
||||
pub fn merge_into<I, A>(slices: I, buffer: &mut Vec<u8>) -> io::Result<()>
|
||||
where
|
||||
I: IntoIterator<Item = A>,
|
||||
A: AsRef<[u8]>,
|
||||
{
|
||||
let mut roaring = RoaringBitmap::new();
|
||||
let mut vec = Vec::new();
|
||||
|
||||
for bytes in slices {
|
||||
if bytes.as_ref().len() <= THRESHOLD * size_of::<u32>() {
|
||||
let mut reader = bytes.as_ref();
|
||||
while let Ok(integer) = reader.read_u32::<NativeEndian>() {
|
||||
vec.push(integer);
|
||||
}
|
||||
} else {
|
||||
roaring |= RoaringBitmap::deserialize_unchecked_from(bytes.as_ref())?;
|
||||
}
|
||||
}
|
||||
|
||||
if roaring.is_empty() {
|
||||
vec.sort_unstable();
|
||||
vec.dedup();
|
||||
|
||||
if vec.len() <= THRESHOLD {
|
||||
for integer in vec {
|
||||
buffer.extend_from_slice(&integer.to_ne_bytes());
|
||||
}
|
||||
} else {
|
||||
// We can unwrap safely because the vector is sorted upper.
|
||||
let roaring = RoaringBitmap::from_sorted_iter(vec).unwrap();
|
||||
roaring.serialize_into(buffer)?;
|
||||
}
|
||||
} else {
|
||||
roaring.extend(vec);
|
||||
roaring.serialize_into(buffer)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Merges a DelAdd delta into a CboRoaringBitmap.
|
||||
pub fn merge_deladd_into<'a>(
|
||||
deladd: &KvReaderDelAdd,
|
||||
previous: &[u8],
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> io::Result<Option<&'a [u8]>> {
|
||||
// Deserialize the bitmap that is already there
|
||||
let mut previous = Self::deserialize_from(previous)?;
|
||||
|
||||
// Remove integers we no more want in the previous bitmap
|
||||
if let Some(value) = deladd.get(DelAdd::Deletion) {
|
||||
previous -= Self::deserialize_from(value)?;
|
||||
}
|
||||
|
||||
// Insert the new integers we want in the previous bitmap
|
||||
if let Some(value) = deladd.get(DelAdd::Addition) {
|
||||
previous |= Self::deserialize_from(value)?;
|
||||
}
|
||||
|
||||
if previous.is_empty() {
|
||||
return Ok(None);
|
||||
}
|
||||
|
||||
Self::serialize_into(&previous, buffer);
|
||||
Ok(Some(&buffer[..]))
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesDecode<'_> for CboRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for CboRoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for CboRoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||
let mut vec = Vec::with_capacity(Self::serialized_size(item));
|
||||
Self::serialize_into(item, &mut vec);
|
||||
Ok(Cow::Owned(vec))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::iter::FromIterator;
|
||||
|
||||
use heed::{BytesDecode, BytesEncode};
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn verify_encoding_decoding() {
|
||||
let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
|
||||
let bytes = CboRoaringBitmapCodec::bytes_encode(&input).unwrap();
|
||||
let output = CboRoaringBitmapCodec::bytes_decode(&bytes).unwrap();
|
||||
assert_eq!(input, output);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn verify_threshold() {
|
||||
let input = RoaringBitmap::from_iter(0..THRESHOLD as u32);
|
||||
|
||||
// use roaring bitmap
|
||||
let mut bytes = Vec::new();
|
||||
input.serialize_into(&mut bytes).unwrap();
|
||||
let roaring_size = bytes.len();
|
||||
|
||||
// use byteorder directly
|
||||
let mut bytes = Vec::new();
|
||||
for integer in input {
|
||||
bytes.write_u32::<NativeEndian>(integer).unwrap();
|
||||
}
|
||||
let bo_size = bytes.len();
|
||||
|
||||
assert!(roaring_size > bo_size);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn merge_cbo_roaring_bitmaps() {
|
||||
let mut buffer = Vec::new();
|
||||
|
||||
let small_data = [
|
||||
RoaringBitmap::from_sorted_iter(1..4).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(2..5).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(4..6).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(1..3).unwrap(),
|
||||
];
|
||||
|
||||
let small_data: Vec<_> =
|
||||
small_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
|
||||
CboRoaringBitmapCodec::merge_into(small_data.as_slice(), &mut buffer).unwrap();
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
|
||||
let expected = RoaringBitmap::from_sorted_iter(1..6).unwrap();
|
||||
assert_eq!(bitmap, expected);
|
||||
|
||||
let medium_data = [
|
||||
RoaringBitmap::from_sorted_iter(1..4).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(2..5).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(4..8).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(0..3).unwrap(),
|
||||
RoaringBitmap::from_sorted_iter(7..23).unwrap(),
|
||||
];
|
||||
|
||||
let medium_data: Vec<_> =
|
||||
medium_data.iter().map(|b| CboRoaringBitmapCodec::bytes_encode(b).unwrap()).collect();
|
||||
buffer.clear();
|
||||
CboRoaringBitmapCodec::merge_into(medium_data.as_slice(), &mut buffer).unwrap();
|
||||
|
||||
let bitmap = CboRoaringBitmapCodec::deserialize_from(&buffer).unwrap();
|
||||
let expected = RoaringBitmap::from_sorted_iter(0..23).unwrap();
|
||||
assert_eq!(bitmap, expected);
|
||||
}
|
||||
}
|
7
crates/milli/src/heed_codec/roaring_bitmap/mod.rs
Normal file
7
crates/milli/src/heed_codec/roaring_bitmap/mod.rs
Normal file
|
@ -0,0 +1,7 @@
|
|||
mod bo_roaring_bitmap_codec;
|
||||
pub mod cbo_roaring_bitmap_codec;
|
||||
mod roaring_bitmap_codec;
|
||||
|
||||
pub use self::bo_roaring_bitmap_codec::BoRoaringBitmapCodec;
|
||||
pub use self::cbo_roaring_bitmap_codec::CboRoaringBitmapCodec;
|
||||
pub use self::roaring_bitmap_codec::RoaringBitmapCodec;
|
|
@ -0,0 +1,34 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use heed::BoxedError;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct RoaringBitmapCodec;
|
||||
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmap::deserialize_unchecked_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapCodec {
|
||||
type DItem = RoaringBitmap;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmap::deserialize_from(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesEncode<'_> for RoaringBitmapCodec {
|
||||
type EItem = RoaringBitmap;
|
||||
|
||||
fn bytes_encode(item: &Self::EItem) -> Result<Cow<'_, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(item.serialized_size());
|
||||
item.serialize_into(&mut bytes)?;
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
use std::mem;
|
||||
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct BoRoaringBitmapLenCodec;
|
||||
|
||||
impl BytesDecode<'_> for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Ok((bytes.len() / mem::size_of::<u32>()) as u64)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for BoRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
|
@ -0,0 +1,33 @@
|
|||
use std::mem;
|
||||
|
||||
use heed::{BoxedError, BytesDecode};
|
||||
|
||||
use super::{BoRoaringBitmapLenCodec, RoaringBitmapLenCodec};
|
||||
use crate::heed_codec::roaring_bitmap::cbo_roaring_bitmap_codec::THRESHOLD;
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
pub struct CboRoaringBitmapLenCodec;
|
||||
|
||||
impl BytesDecode<'_> for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
if bytes.len() <= THRESHOLD * mem::size_of::<u32>() {
|
||||
// If there is threshold or less than threshold integers that can fit into this array
|
||||
// of bytes it means that we used the ByteOrder codec serializer.
|
||||
BoRoaringBitmapLenCodec::bytes_decode(bytes)
|
||||
} else {
|
||||
// Otherwise, it means we used the classic RoaringBitmapCodec and
|
||||
// that the header takes threshold integers.
|
||||
RoaringBitmapLenCodec::bytes_decode(bytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for CboRoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
Self::bytes_decode(bytes)
|
||||
}
|
||||
}
|
7
crates/milli/src/heed_codec/roaring_bitmap_length/mod.rs
Normal file
7
crates/milli/src/heed_codec/roaring_bitmap_length/mod.rs
Normal file
|
@ -0,0 +1,7 @@
|
|||
mod bo_roaring_bitmap_len_codec;
|
||||
mod cbo_roaring_bitmap_len_codec;
|
||||
mod roaring_bitmap_len_codec;
|
||||
|
||||
pub use self::bo_roaring_bitmap_len_codec::BoRoaringBitmapLenCodec;
|
||||
pub use self::cbo_roaring_bitmap_len_codec::CboRoaringBitmapLenCodec;
|
||||
pub use self::roaring_bitmap_len_codec::RoaringBitmapLenCodec;
|
|
@ -0,0 +1,88 @@
|
|||
use std::io::{self, BufRead, Read};
|
||||
use std::mem;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt};
|
||||
use heed::BoxedError;
|
||||
|
||||
use crate::heed_codec::BytesDecodeOwned;
|
||||
|
||||
const SERIAL_COOKIE_NO_RUNCONTAINER: u32 = 12346;
|
||||
const SERIAL_COOKIE: u16 = 12347;
|
||||
|
||||
pub struct RoaringBitmapLenCodec;
|
||||
|
||||
impl RoaringBitmapLenCodec {
|
||||
// FIXME should be exported in the RoaringBitmap crate
|
||||
fn deserialize_from_slice(mut bytes: &[u8]) -> io::Result<u64> {
|
||||
let (size, has_offsets) = {
|
||||
let cookie = bytes.read_u32::<LittleEndian>()?;
|
||||
if cookie == SERIAL_COOKIE_NO_RUNCONTAINER {
|
||||
(bytes.read_u32::<LittleEndian>()? as usize, true)
|
||||
} else if (cookie as u16) == SERIAL_COOKIE {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "run containers are unsupported"));
|
||||
} else {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "unknown cookie value"));
|
||||
}
|
||||
};
|
||||
|
||||
if size > u16::MAX as usize + 1 {
|
||||
return Err(io::Error::new(io::ErrorKind::Other, "size is greater than supported"));
|
||||
}
|
||||
|
||||
let mut description_bytes = vec![0u8; size * 4];
|
||||
bytes.read_exact(&mut description_bytes)?;
|
||||
let description_bytes = &mut &description_bytes[..];
|
||||
|
||||
if has_offsets {
|
||||
bytes.consume(size * 4);
|
||||
}
|
||||
|
||||
let mut length = 0;
|
||||
for _ in 0..size {
|
||||
let _key = description_bytes.read_u16::<LittleEndian>()?;
|
||||
let len = u64::from(description_bytes.read_u16::<LittleEndian>()?) + 1;
|
||||
length += len;
|
||||
|
||||
if len <= 4096 {
|
||||
bytes.consume(len as usize * mem::size_of::<u16>());
|
||||
} else {
|
||||
bytes.consume(1024 * mem::size_of::<u64>())
|
||||
}
|
||||
}
|
||||
|
||||
Ok(length)
|
||||
}
|
||||
}
|
||||
|
||||
impl heed::BytesDecode<'_> for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
impl BytesDecodeOwned for RoaringBitmapLenCodec {
|
||||
type DItem = u64;
|
||||
|
||||
fn bytes_decode_owned(bytes: &[u8]) -> Result<Self::DItem, BoxedError> {
|
||||
RoaringBitmapLenCodec::deserialize_from_slice(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use heed::BytesEncode;
|
||||
use roaring::RoaringBitmap;
|
||||
|
||||
use super::*;
|
||||
use crate::heed_codec::RoaringBitmapCodec;
|
||||
|
||||
#[test]
|
||||
fn deserialize_roaring_bitmap_length() {
|
||||
let bitmap: RoaringBitmap = (0..500).chain(800..800_000).chain(920_056..930_032).collect();
|
||||
let bytes = RoaringBitmapCodec::bytes_encode(&bitmap).unwrap();
|
||||
let len = RoaringBitmapLenCodec::deserialize_from_slice(&bytes).unwrap();
|
||||
assert_eq!(bitmap.len(), len);
|
||||
}
|
||||
}
|
79
crates/milli/src/heed_codec/str_beu32_codec.rs
Normal file
79
crates/milli/src/heed_codec/str_beu32_codec.rs
Normal file
|
@ -0,0 +1,79 @@
|
|||
use std::borrow::Cow;
|
||||
use std::convert::TryInto;
|
||||
use std::mem::size_of;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
|
||||
pub struct StrBEU32Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
|
||||
type DItem = (&'a str, u32);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let footer_len = size_of::<u32>();
|
||||
|
||||
if bytes.len() < footer_len {
|
||||
return Err(SliceTooShortError.into());
|
||||
}
|
||||
|
||||
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
let word = str::from_utf8(word)?;
|
||||
let pos = bytes.try_into().map(u32::from_be_bytes)?;
|
||||
|
||||
Ok((word, pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
|
||||
type EItem = (&'a str, u32);
|
||||
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let pos = pos.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + pos.len());
|
||||
bytes.extend_from_slice(word.as_bytes());
|
||||
bytes.extend_from_slice(&pos[..]);
|
||||
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StrBEU16Codec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for StrBEU16Codec {
|
||||
type DItem = (&'a str, u16);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let footer_len = size_of::<u16>();
|
||||
|
||||
if bytes.len() < footer_len + 1 {
|
||||
return Err(SliceTooShortError.into());
|
||||
}
|
||||
|
||||
let (word_plus_nul_byte, bytes) = bytes.split_at(bytes.len() - footer_len);
|
||||
// unwrap: we just checked the footer + 1 above.
|
||||
let (_, word) = word_plus_nul_byte.split_last().unwrap();
|
||||
let word = str::from_utf8(word)?;
|
||||
let pos = bytes.try_into().map(u16::from_be_bytes)?;
|
||||
|
||||
Ok((word, pos))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for StrBEU16Codec {
|
||||
type EItem = (&'a str, u16);
|
||||
|
||||
fn bytes_encode((word, pos): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let pos = pos.to_be_bytes();
|
||||
|
||||
let mut bytes = Vec::with_capacity(word.len() + 1 + pos.len());
|
||||
bytes.extend_from_slice(word.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(&pos[..]);
|
||||
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
21
crates/milli/src/heed_codec/str_ref.rs
Normal file
21
crates/milli/src/heed_codec/str_ref.rs
Normal file
|
@ -0,0 +1,21 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use heed::{BoxedError, BytesDecode, BytesEncode};
|
||||
|
||||
/// A codec for values of type `&str`. Unlike `Str`, its `EItem` and `DItem` associated
|
||||
/// types are equivalent (= `&'a str`) and these values can reside within another structure.
|
||||
pub struct StrRefCodec;
|
||||
impl<'a> BytesEncode<'a> for StrRefCodec {
|
||||
type EItem = &'a str;
|
||||
|
||||
fn bytes_encode(item: &'a &'a str) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
Ok(Cow::Borrowed(item.as_bytes()))
|
||||
}
|
||||
}
|
||||
impl<'a> BytesDecode<'a> for StrRefCodec {
|
||||
type DItem = &'a str;
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
std::str::from_utf8(bytes).map_err(Into::into)
|
||||
}
|
||||
}
|
62
crates/milli/src/heed_codec/str_str_u8_codec.rs
Normal file
62
crates/milli/src/heed_codec/str_str_u8_codec.rs
Normal file
|
@ -0,0 +1,62 @@
|
|||
use std::borrow::Cow;
|
||||
use std::ffi::CStr;
|
||||
use std::str;
|
||||
|
||||
use heed::BoxedError;
|
||||
|
||||
use super::SliceTooShortError;
|
||||
|
||||
pub struct U8StrStrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for U8StrStrCodec {
|
||||
type DItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
|
||||
let cstr = CStr::from_bytes_until_nul(bytes)?;
|
||||
let s1 = cstr.to_str()?;
|
||||
// skip '\0' byte between the two strings.
|
||||
let s2 = str::from_utf8(&bytes[s1.len() + 1..])?;
|
||||
Ok((*n, s1, s2))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for U8StrStrCodec {
|
||||
type EItem = (u8, &'a str, &'a str);
|
||||
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1.as_bytes());
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2.as_bytes());
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
||||
pub struct UncheckedU8StrStrCodec;
|
||||
|
||||
impl<'a> heed::BytesDecode<'a> for UncheckedU8StrStrCodec {
|
||||
type DItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
|
||||
let (n, bytes) = bytes.split_first().ok_or(SliceTooShortError)?;
|
||||
let cstr = CStr::from_bytes_until_nul(bytes)?;
|
||||
let s1_bytes = cstr.to_bytes();
|
||||
// skip '\0' byte between the two strings.
|
||||
let s2_bytes = &bytes[s1_bytes.len() + 1..];
|
||||
Ok((*n, s1_bytes, s2_bytes))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> heed::BytesEncode<'a> for UncheckedU8StrStrCodec {
|
||||
type EItem = (u8, &'a [u8], &'a [u8]);
|
||||
|
||||
fn bytes_encode((n, s1, s2): &Self::EItem) -> Result<Cow<'a, [u8]>, BoxedError> {
|
||||
let mut bytes = Vec::with_capacity(s1.len() + s2.len() + 1);
|
||||
bytes.push(*n);
|
||||
bytes.extend_from_slice(s1);
|
||||
bytes.push(0);
|
||||
bytes.extend_from_slice(s2);
|
||||
Ok(Cow::Owned(bytes))
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue