Generate the dictionary from the first 10k documents

Clément Renault 2024-07-02 15:49:56 +02:00
parent 0d63d02ab2
commit 767f20e30d
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
4 changed files with 45 additions and 12 deletions

Cargo.lock (generated, 24 changes)

@@ -3563,6 +3563,7 @@ dependencies = [
  "ureq",
  "url",
  "uuid",
+ "zstd 0.11.2+zstd.1.5.2",
 ]

 [[package]]
@@ -6406,7 +6407,7 @@ dependencies = [
  "time",
  "zeroize",
  "zopfli",
- "zstd",
+ "zstd 0.13.2",
 ]

 [[package]]
@@ -6423,13 +6424,32 @@ dependencies = [
  "simd-adler32",
 ]

+[[package]]
+name = "zstd"
+version = "0.11.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
+dependencies = [
+ "zstd-safe 5.0.2+zstd.1.5.2",
+]
+
 [[package]]
 name = "zstd"
 version = "0.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
 dependencies = [
- "zstd-safe",
+ "zstd-safe 7.2.0",
 ]

+[[package]]
+name = "zstd-safe"
+version = "5.0.2+zstd.1.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
+dependencies = [
+ "libc",
+ "zstd-sys",
+]
+
 [[package]]

milli/Cargo.toml

@@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
 json-depth-checker = { path = "../json-depth-checker" }
 levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
 lz4_flex = "0.11.3"
+zstd = { version = "0.11.2", features = ["zdict_builder"] }
 memmap2 = "0.9.4"
 obkv = "0.2.2"
 once_cell = "1.19.0"
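The new `zdict_builder` feature flag is what exposes the `zstd::dict` training functions this commit relies on. A minimal sketch of that API, separate from milli (the sample corpus and the 1_024-byte budget are invented; training can fail when samples are too few or too small, hence the propagated error):

    fn main() -> std::io::Result<()> {
        // Invented corpus: many small, similar records, which is the case
        // where a shared compression dictionary helps most.
        let samples: Vec<Vec<u8>> = (0..10_000)
            .map(|i| format!("id={i} genre=horror rank={}", i % 7).into_bytes())
            .collect();
        // `from_samples` is the discrete-sample sibling of the
        // `from_continuous` call used later in this commit.
        let dict = zstd::dict::from_samples(&samples, 1_024)?;
        println!("trained a {}-byte dictionary", dict.len());
        Ok(())
    }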

milli/src/heed_codec/compressed_obkv_codec.rs

@@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
     pub fn decompress_with<'b>(
         &self,
         buffer: &'b mut Vec<u8>,
-        dictionnary: &[u8],
+        dictionary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
         let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
         buffer.resize(size, 0);
         // TODO loop to increase the buffer size if need be
         let size =
-            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
+            lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
         Ok(KvReaderU16::new(&buffer[..size]))
     }
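For reference, the same decompression path as a standalone function, free of the milli types (a sketch, assuming `compressed` starts with the little-endian size prefix that lz4_flex's `*_prepend_size` writers produce):

    fn decompress_document(
        compressed: &[u8],
        dictionary: &[u8],
    ) -> Result<Vec<u8>, lz4_flex::block::DecompressError> {
        // The block is prefixed with its uncompressed size; split it off.
        let (size, input) = lz4_flex::block::uncompressed_size(compressed)?;
        let mut buffer = vec![0u8; size];
        // Decompress against the dictionary shared by all documents.
        let written =
            lz4_flex::block::decompress_into_with_dict(input, &mut buffer, dictionary)?;
        buffer.truncate(written);
        Ok(buffer)
    }

The writer side, `CompressedKvWriterU16::new_with_dictionary`, presumably wraps the matching `lz4_flex::block::compress_prepend_size_with_dict`; that size prefix is what `uncompressed_size` reads back here.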

milli/src/update/index_documents/mod.rs

@@ -5,7 +5,7 @@ mod transform;
 mod typed_chunk;

 use std::collections::{HashMap, HashSet};
-use std::io::{Read, Seek};
+use std::io::{BufWriter, Read, Seek, Write};
 use std::iter;
 use std::num::NonZeroU32;
 use std::result::Result as StdResult;
@@ -41,7 +41,7 @@ use crate::update::{
     IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
 };
 use crate::vector::EmbeddingConfigs;
-use crate::{CboRoaringBitmapCodec, Index, Result};
+use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};

 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 4;
@@ -568,7 +568,7 @@ where
         // TODO increase this number to 10k and put it in a const somewhere
         // I don't like that this dangerous condition is here...
-        if number_of_documents > 1_000
+        if number_of_documents > 10_000
             && self.index.document_compression_dictionary(self.wtxn)?.is_none()
         {
             self.manage_compression_dictionary()?;
@@ -767,17 +767,29 @@
         name = "compress_documents_database"
     )]
     pub fn manage_compression_dictionary(&mut self) -> Result<()> {
-        // TODO This is a dumb dictionary, just so you get the idea.
-        // We need to compute a better one by using zstd or something else.
-        let dictionary = b"movietraileradventurehorror";
-        self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
+        let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
+        let mut sample_sizes = Vec::new();
+        // TODO make this 10_000 a const
+        let documents = self.index.documents.remap_types::<BEU32, Bytes>();
+        for result in documents.iter(self.wtxn)?.take(10_000) {
+            let (_id, bytes) = result?;
+            sample_file.write_all(bytes)?;
+            sample_sizes.push(bytes.len());
+        }
+        // TODO manage this unwrap correctly
+        let sample_file = sample_file.into_inner().unwrap();
+        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
+        // TODO make this 64_000 a const
+        let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
+        self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;

         // TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
         let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
         while let Some(result) = iter.next() {
             let (docid, document) = result?;
             // TODO manage this unwrap correctly
-            let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
+            let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
             // safety: the compressed document is entirely owned
             unsafe {
                 iter.put_current_with_options::<CompressedObkvCodec>(
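The new body samples up to 10_000 stored documents into a temporary file, memory-maps it, and trains a 64_000-byte zstd dictionary over the concatenated bytes. Here is that pattern as a self-contained sketch, with an iterator of raw document bytes standing in for the heed cursor (the function name and signature are assumptions):

    use std::io::{BufWriter, Write};

    fn build_dictionary<'a>(
        documents: impl Iterator<Item = &'a [u8]>,
    ) -> std::io::Result<Vec<u8>> {
        // Concatenate raw documents into a temp file; zstd training only
        // needs one continuous buffer plus the size of each sample in it.
        let mut sample_file = BufWriter::new(tempfile::tempfile()?);
        let mut sample_sizes = Vec::new();
        for bytes in documents.take(10_000) {
            sample_file.write_all(bytes)?;
            sample_sizes.push(bytes.len());
        }
        // Flush the writer, then memory-map the file as the commit does.
        let sample_file = sample_file.into_inner().map_err(|e| e.into_error())?;
        // Safety: the temp file is private to this process and is not
        // resized while mapped, mirroring the unsafe mmap above.
        let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
        zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)
    }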