Simplify word level position DB into a word position DB

This commit is contained in:
many 2021-10-05 11:18:42 +02:00
parent 75d341d928
commit 3296bb243c
No known key found for this signature in database
GPG key ID: 2CEF23B75189EACA
18 changed files with 220 additions and 545 deletions

View file

@ -4,7 +4,7 @@ mod field_id_word_count_codec;
mod obkv_codec;
mod roaring_bitmap;
mod roaring_bitmap_length;
mod str_level_position_codec;
mod str_beu32_codec;
mod str_str_u8_codec;
pub use self::beu32_str_codec::BEU32StrCodec;
@ -14,5 +14,5 @@ pub use self::roaring_bitmap::{BoRoaringBitmapCodec, CboRoaringBitmapCodec, Roar
pub use self::roaring_bitmap_length::{
BoRoaringBitmapLenCodec, CboRoaringBitmapLenCodec, RoaringBitmapLenCodec,
};
pub use self::str_level_position_codec::StrLevelPositionCodec;
pub use self::str_beu32_codec::StrBEU32Codec;
pub use self::str_str_u8_codec::StrStrU8Codec;

View file

@ -0,0 +1,38 @@
use std::borrow::Cow;
use std::convert::TryInto;
use std::mem::size_of;
use std::str;
pub struct StrBEU32Codec;
impl<'a> heed::BytesDecode<'a> for StrBEU32Codec {
type DItem = (&'a str, u32);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u32>();
if bytes.len() < footer_len {
return None;
}
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word).ok()?;
let pos = bytes.try_into().map(u32::from_be_bytes).ok()?;
Some((word, pos))
}
}
impl<'a> heed::BytesEncode<'a> for StrBEU32Codec {
type EItem = (&'a str, u32);
fn bytes_encode((word, pos): &Self::EItem) -> Option<Cow<[u8]>> {
let pos = pos.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + pos.len());
bytes.extend_from_slice(word.as_bytes());
bytes.extend_from_slice(&pos[..]);
Some(Cow::Owned(bytes))
}
}

View file

@ -1,47 +0,0 @@
use std::borrow::Cow;
use std::convert::{TryFrom, TryInto};
use std::mem::size_of;
use std::str;
use crate::TreeLevel;
pub struct StrLevelPositionCodec;
impl<'a> heed::BytesDecode<'a> for StrLevelPositionCodec {
type DItem = (&'a str, TreeLevel, u32, u32);
fn bytes_decode(bytes: &'a [u8]) -> Option<Self::DItem> {
let footer_len = size_of::<u8>() + size_of::<u32>() * 2;
if bytes.len() < footer_len {
return None;
}
let (word, bytes) = bytes.split_at(bytes.len() - footer_len);
let word = str::from_utf8(word).ok()?;
let (level, bytes) = bytes.split_first()?;
let left = bytes[..4].try_into().map(u32::from_be_bytes).ok()?;
let right = bytes[4..].try_into().map(u32::from_be_bytes).ok()?;
let level = TreeLevel::try_from(*level).ok()?;
Some((word, level, left, right))
}
}
impl<'a> heed::BytesEncode<'a> for StrLevelPositionCodec {
type EItem = (&'a str, TreeLevel, u32, u32);
fn bytes_encode((word, level, left, right): &Self::EItem) -> Option<Cow<[u8]>> {
let left = left.to_be_bytes();
let right = right.to_be_bytes();
let mut bytes = Vec::with_capacity(word.len() + 1 + left.len() + right.len());
bytes.extend_from_slice(word.as_bytes());
bytes.push((*level).into());
bytes.extend_from_slice(&left[..]);
bytes.extend_from_slice(&right[..]);
Some(Cow::Owned(bytes))
}
}