diff --git a/Cargo.lock b/Cargo.lock
index 05bd80213..1ef39db61 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3268,6 +3268,15 @@ version = "0.4.22"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24"
 
+[[package]]
+name = "lz4_flex"
+version = "0.11.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "75761162ae2b0e580d7e7c390558127e5f01b4194debd6221fd8c207fc80e3f5"
+dependencies = [
+ "twox-hash",
+]
+
 [[package]]
 name = "lzma-rs"
 version = "0.3.0"
@@ -3526,6 +3535,7 @@ dependencies = [
  "json-depth-checker",
  "levenshtein_automata",
  "liquid",
+ "lz4_flex",
  "maplit",
  "md5",
  "meili-snap",
@@ -5618,6 +5628,16 @@ version = "0.2.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b"
 
+[[package]]
+name = "twox-hash"
+version = "1.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
+dependencies = [
+ "cfg-if",
+ "static_assertions",
+]
+
 [[package]]
 name = "typenum"
 version = "1.17.0"
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 64f0116d3..a670e43b1 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -38,6 +38,7 @@ heed = { version = "0.20.3", default-features = false, features = [
 indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
+lz4_flex = "0.11.3"
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
diff --git a/milli/src/heed_codec/compressed_obkv_codec.rs b/milli/src/heed_codec/compressed_obkv_codec.rs
new file mode 100644
index 000000000..0fddc40d3
--- /dev/null
+++ b/milli/src/heed_codec/compressed_obkv_codec.rs
@@ -0,0 +1,72 @@
+use std::borrow::Cow;
+
+use heed::BoxedError;
+use obkv::KvReaderU16;
+
+/// A heed codec whose stored values are LZ4-compressed obkv documents.
+///
+/// Decoding hands back the still-compressed bytes wrapped in a
+/// [`CompressedKvReaderU16`]; encoding stores the bytes of a
+/// [`CompressedKvWriterU16`] verbatim.
+pub struct ObkvCompressedCodec;
+
+impl<'a> heed::BytesDecode<'a> for ObkvCompressedCodec {
+    type DItem = CompressedKvReaderU16<'a>;
+
+    fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
+        // No work is done here: decompression is deferred to `decompress_with`.
+        Ok(CompressedKvReaderU16(bytes))
+    }
+}
+
+impl heed::BytesEncode<'_> for ObkvCompressedCodec {
+    type EItem = CompressedKvWriterU16;
+
+    fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
+        Ok(Cow::Borrowed(&item.0))
+    }
+}
+
+/// Borrowed, still-compressed bytes as produced by [`CompressedKvWriterU16`].
+pub struct CompressedKvReaderU16<'a>(&'a [u8]);
+
+impl<'a> CompressedKvReaderU16<'a> {
+    /// Decompresses the payload into `buffer` and returns an obkv reader over it.
+    ///
+    /// `buffer` is resized to the uncompressed length stored in front of the
+    /// payload; `dictionnary` should be the dictionary used for compression.
+    ///
+    /// # Errors
+    ///
+    /// Returns a `DecompressError` when the size prefix cannot be read or the
+    /// payload cannot be decompressed into the announced number of bytes.
+    pub fn decompress_with<'b>(
+        &self,
+        buffer: &'b mut Vec<u8>,
+        dictionnary: &[u8],
+    ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
+        // The compressed length says nothing about the decompressed length, so
+        // the buffer is sized from the length prefix written at compression time.
+        let (expected_size, payload) = lz4_flex::block::uncompressed_size(self.0)?;
+        buffer.resize(expected_size, 0);
+        let size = lz4_flex::block::decompress_into_with_dict(
+            payload,
+            &mut buffer[..expected_size],
+            dictionnary,
+        )?;
+        Ok(KvReaderU16::new(&buffer[..size]))
+    }
+}
+
+/// Owned compressed bytes, ready to be stored through [`ObkvCompressedCodec`].
+pub struct CompressedKvWriterU16(Vec<u8>);
+
+impl CompressedKvWriterU16 {
+    /// Compresses `writer` with `dictionnary`, prefixing the uncompressed length
+    /// so that [`CompressedKvReaderU16::decompress_with`] can size its buffer.
+    // TODO ask for a KvReaderU16 here
+    pub fn new_with_dictionnary(writer: &[u8], dictionnary: &[u8]) -> Self {
+        let compressed = lz4_flex::block::compress_prepend_size_with_dict(writer, dictionnary);
+        CompressedKvWriterU16(compressed)
+    }
+}
diff --git a/milli/src/heed_codec/mod.rs b/milli/src/heed_codec/mod.rs
index 449d1955c..908a86a29 100644
--- a/milli/src/heed_codec/mod.rs
+++ b/milli/src/heed_codec/mod.rs
@@ -1,6 +1,7 @@
 mod beu16_str_codec;
 mod beu32_str_codec;
 mod byte_slice_ref;
+mod compressed_obkv_codec;
 pub mod facet;
 mod field_id_word_count_codec;
 mod fst_set_codec;