Generate the dictionary from the first 10k documents

Clément Renault 2024-07-02 15:49:56 +02:00
parent 0d63d02ab2
commit 767f20e30d
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 45 additions and 12 deletions

Cargo.lock (generated, 24 changes)

@@ -3563,6 +3563,7 @@ dependencies = [
  "ureq",
  "url",
  "uuid",
+ "zstd 0.11.2+zstd.1.5.2",
 ]

 [[package]]
@@ -6406,7 +6407,7 @@ dependencies = [
  "time",
  "zeroize",
  "zopfli",
- "zstd",
+ "zstd 0.13.2",
 ]

 [[package]]
@@ -6423,13 +6424,32 @@ dependencies = [
  "simd-adler32",
 ]

+[[package]]
+name = "zstd"
+version = "0.11.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+dependencies = [
+ "zstd-safe 5.0.2+zstd.1.5.2",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
 dependencies = [
- "zstd-safe",
+ "zstd-safe 7.2.0",
 ]

+[[package]]
+name = "zstd-safe"
+version = "5.0.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
 [[package]]

milli/Cargo.toml

@@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 lz4_flex = "0.11.3"
+zstd = { version = "0.11.2", features = ["zdict_builder"] }
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
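The new `zdict_builder` feature flag is what exposes the `zstd::dict` training functions this commit relies on. A minimal sketch of that API, separate from milli (the sample corpus and the 1_024-byte budget are invented; training can fail when samples are too few or too small, hence the propagated error):

    fn main() -> std::io::Result<()> {
        // Invented corpus: many small, similar records, which is the case
        // where a shared compression dictionary helps most.
        let samples: Vec<Vec<u8>> = (0..10_000)
            .map(|i| format!("id={i} genre=horror rank={}", i % 7).into_bytes())
            .collect();
        // `from_samples` is the discrete-sample sibling of the
        // `from_continuous` call used later in this commit.
        let dict = zstd::dict::from_samples(&samples, 1_024)?;
        println!("trained a {}-byte dictionary", dict.len());
        Ok(())
    }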

milli/src/heed_codec/compressed_obkv_codec.rs

@@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
     pub fn decompress_with<'b>(
         &self,
         buffer: &'b mut Vec<u8>,
-        dictionnary: &[u8],
+        dictionary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
         let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
         buffer.resize(size, 0);
         // TODO loop to increase the buffer size if need be
         let size =
-            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
+            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
         Ok(KvReaderU16::new(&buffer[..size]))
     }
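For reference, the same decompression path as a standalone function, free of the milli types (a sketch, assuming `compressed` starts with the little-endian size prefix that lz4_flex's `*_prepend_size` writers produce):

    fn decompress_document(
        compressed: &[u8],
        dictionary: &[u8],
    ) -> Result<Vec<u8>, lz4_flex::block::DecompressError> {
        // The block is prefixed with its uncompressed size; split it off.
        let (size, input) = lz4_flex::block::uncompressed_size(compressed)?;
        let mut buffer = vec![0u8; size];
        // Decompress against the dictionary shared by all documents.
        let written =
            lz4_flex::block::decompress_into_with_dict(input, &mut buffer, dictionary)?;
        buffer.truncate(written);
        Ok(buffer)
    }

The writer side, `CompressedKvWriterU16::new_with_dictionary`, presumably wraps the matching `lz4_flex::block::compress_prepend_size_with_dict`; that size prefix is what `uncompressed_size` reads back here.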

milli/src/update/index_documents/mod.rs

@@ -5,7 +5,7 @@ mod transform;
 mod typed_chunk;

 use std::collections::{HashMap, HashSet};
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek, Write};
 use std::iter;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -41,7 +41,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::EmbeddingConfigs;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};

 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -568,7 +568,7 @@ where
         // TODO increase this number to 10k and put it in a const somewhere
         // I don't like that this dangerous condition is here...
-        if number_of_documents > 1_000
+        if number_of_documents > 10_000
             && self.index.document_compression_dictionary(self.wtxn)?.is_none()
         {
             self.manage_compression_dictionary()?;
@@ -767,17 +767,29 @@
         name = "compress_documents_database"
     )]
     pub fn manage_compression_dictionary(&mut self) -> Result<()> {
-        // TODO This is a dumb dictionary, just so you get the idea.
-        // We need to compute a better one by using zstd or something else.
-        let dictionary = b"movietraileradventurehorror";
-        self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
+        let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
+        let mut sample_sizes = Vec::new();
+        // TODO make this 10_000 a const
+        let documents = self.index.documents.remap_types::<BEU32, Bytes>();
+        for result in documents.iter(self.wtxn)?.take(10_000) {
+            let (_id, bytes) = result?;
+            sample_file.write_all(bytes)?;
+            sample_sizes.push(bytes.len());
+        }
+        // TODO manage this unwrap correctly
+        let sample_file = sample_file.into_inner().unwrap();
+        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
+        // TODO make this 64_000 a const
+        let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
+        self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;

         // TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
         let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
             let (docid, document) = result?;
             // TODO manage this unwrap correctly
-            let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
+            let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
             // safety: the compressed document is entirely owned
             unsafe {
                 iter.put_current_with_options::<CompressedObkvCodec>(
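The new body samples up to 10_000 stored documents into a temporary file, memory-maps it, and trains a 64_000-byte zstd dictionary over the concatenated bytes. Here is that pattern as a self-contained sketch, with an iterator of raw document bytes standing in for the heed cursor (the function name and signature are assumptions):

    use std::io::{BufWriter, Write};

    fn build_dictionary<'a>(
        documents: impl Iterator<Item = &'a [u8]>,
    ) -> std::io::Result<Vec<u8>> {
        // Concatenate raw documents into a temp file; zstd training only
        // needs one continuous buffer plus the size of each sample in it.
        let mut sample_file = BufWriter::new(tempfile::tempfile()?);
        let mut sample_sizes = Vec::new();
        for bytes in documents.take(10_000) {
            sample_file.write_all(bytes)?;
            sample_sizes.push(bytes.len());
        }
        // Flush the writer, then memory-map the file as the commit does.
        let sample_file = sample_file.into_inner().map_err(|e| e.into_error())?;
        // Safety: the temp file is private to this process and is not
        // resized while mapped, mirroring the unsafe mmap above.
        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
        zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)
    }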