From 767f20e30dcbd1b94289e795a42bfcf062ca1147 Mon Sep 17 00:00:00 2001
From: Clément Renault
Date: Tue, 2 Jul 2024 15:49:56 +0200
Subject: [PATCH] Generate the dictionary from the first 10k documents

---
 Cargo.lock                                    | 24 ++++++++++++++--
 milli/Cargo.toml                              |  1 +
 milli/src/heed_codec/compressed_obkv_codec.rs |  4 +--
 milli/src/update/index_documents/mod.rs       | 28 +++++++++++++------
 4 files changed, 45 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1ef39db61..d71ed1bc0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3563,6 +3563,7 @@ dependencies = [
  "ureq",
  "url",
  "uuid",
+ "zstd 0.11.2+zstd.1.5.2",
 ]
 
 [[package]]
@@ -6406,7 +6407,7 @@ dependencies = [
  "time",
  "zeroize",
  "zopfli",
- "zstd",
+ "zstd 0.13.2",
 ]
 
 [[package]]
@@ -6423,13 +6424,32 @@ dependencies = [
  "simd-adler32",
 ]
 
+[[package]]
+name = "zstd"
+version = "0.11.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+dependencies = [
+ "zstd-safe 5.0.2+zstd.1.5.2",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
 dependencies = [
- "zstd-safe",
+ "zstd-safe 7.2.0",
+]
+
+[[package]]
+name = "zstd-safe"
+version = "5.0.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+dependencies = [
+ "libc",
+ "zstd-sys",
 ]
 
 [[package]]
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index a670e43b1..61bc5eddf 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 lz4_flex = "0.11.3"
+zstd = { version = "0.11.2", features = ["zdict_builder"] }
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
diff --git a/milli/src/heed_codec/compressed_obkv_codec.rs b/milli/src/heed_codec/compressed_obkv_codec.rs
index d6ec8e717..4abcbb84d 100644
--- a/milli/src/heed_codec/compressed_obkv_codec.rs
+++ b/milli/src/heed_codec/compressed_obkv_codec.rs
@@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
     pub fn decompress_with<'b>(
         &self,
         buffer: &'b mut Vec<u8>,
-        dictionnary: &[u8],
+        dictionary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
         let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
         buffer.resize(size, 0);
         // TODO loop to increase the buffer size if need be
         let size =
-            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
+            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
         Ok(KvReaderU16::new(&buffer[..size]))
     }
 
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 0fe333e8b..a9a1ebc43 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -5,7 +5,7 @@ mod transform;
 mod typed_chunk;
 
 use std::collections::{HashMap, HashSet};
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek, Write};
 use std::iter;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -41,7 +41,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::EmbeddingConfigs;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -568,7 +568,7 @@ where
 
         // TODO increase this number to 10k and put it in a const somewhere
         // I don't like that this dangerous condition is here...
-        if number_of_documents > 1_000
+        if number_of_documents > 10_000
             && self.index.document_compression_dictionary(self.wtxn)?.is_none()
         {
             self.manage_compression_dictionary()?;
@@ -767,17 +767,29 @@ where
         name = "compress_documents_database"
     )]
     pub fn manage_compression_dictionary(&mut self) -> Result<()> {
-        // TODO This is a dumb dictionary, just so you get the idea.
-        // We need to compute a better one by using zstd or something else.
-        let dictionary = b"movietraileradventurehorror";
-        self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
+        let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
+        let mut sample_sizes = Vec::new();
+        // TODO make this 10_000 a const
+        let documents = self.index.documents.remap_types::<BEU32, Bytes>();
+        for result in documents.iter(self.wtxn)?.take(10_000) {
+            let (_id, bytes) = result?;
+            sample_file.write_all(bytes)?;
+            sample_sizes.push(bytes.len());
+        }
+
+        // TODO manage this unwrap correctly
+        let sample_file = sample_file.into_inner().unwrap();
+        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
+        // TODO make this 64_000 a const
+        let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
+        self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
         // TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
         let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
             let (docid, document) = result?;
             // TODO manage this unwrap correctly
-            let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
+            let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
             // safety the compressed document is entirely owned
             unsafe {
                 iter.put_current_with_options::(
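
Note: a minimal sketch of the zstd dictionary-training flow this patch adopts. `zstd::dict::from_continuous` (gated behind the `zdict_builder` feature enabled in milli/Cargo.toml) takes one continuous buffer of concatenated samples plus each sample's size, which is exactly how `manage_compression_dictionary` lays out the first 10k documents. The synthetic JSON samples and the `zstd::bulk` round trip below are illustrative only; the patch itself hands the trained dictionary to the lz4-based `CompressedKvWriterU16`.

// Sketch: train a zstd dictionary from concatenated samples, then
// round-trip one payload through it to check that it works.
fn main() -> std::io::Result<()> {
    // Concatenate the samples into a single buffer and record each
    // sample's length, mirroring the sample_file/sample_sizes pair
    // built by manage_compression_dictionary.
    let mut sample_data = Vec::new();
    let mut sample_sizes = Vec::new();
    for i in 0..10_000 {
        let doc = format!(r#"{{"id":{i},"title":"movie trailer {i}","genre":"adventure"}}"#);
        sample_data.extend_from_slice(doc.as_bytes());
        sample_sizes.push(doc.len());
    }

    // Cap the dictionary at 64_000 bytes, the budget used in the patch.
    let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;

    // Compress and decompress one document with the trained dictionary
    // (level 0 selects zstd's default compression level).
    let document = br#"{"id":42,"title":"movie trailer 42","genre":"adventure"}"#;
    let compressed = zstd::bulk::Compressor::with_dictionary(0, &dictionary)?.compress(document)?;
    let decompressed = zstd::bulk::Decompressor::with_dictionary(&dictionary)?
        .decompress(&compressed, document.len())?;
    assert_eq!(document.as_slice(), decompressed.as_slice());
    Ok(())
}

Training on the first documents only is a bet that they are representative of the rest of the database; the dictionary will compress later documents well only if their shape stays similar.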
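A companion sketch of the lz4 side that `CompressedKvReaderU16::decompress_with` drives: `uncompressed_size` reads the length prefix, the caller sizes a reusable buffer, and `decompress_into_with_dict` fills it using the shared dictionary. `compress_prepend_size_with_dict` is assumed here as the matching writer-side call (the patch only shows the reader), and the dictionary bytes are placeholders.

use lz4_flex::block::{
    compress_prepend_size_with_dict, decompress_into_with_dict, uncompressed_size,
};

fn main() -> Result<(), lz4_flex::block::DecompressError> {
    let dictionary = b"movietraileradventurehorror"; // placeholder shared dictionary
    let document = b"a movie trailer for an adventure movie";

    // Writer side: compress against the dictionary, prepending the
    // uncompressed size so the reader knows how much room to allocate.
    let compressed = compress_prepend_size_with_dict(document, dictionary);

    // Reader side, mirroring decompress_with: read the prepended size,
    // size the buffer, then decompress into it with the same dictionary.
    let (size, input) = uncompressed_size(&compressed)?;
    let mut buffer = vec![0u8; size];
    let written = decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
    assert_eq!(&buffer[..written], document.as_slice());
    Ok(())
}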