First version compressing the documents

Clément Renault 2024-07-02 12:11:26 +02:00
parent e9d6b4222b
commit bf5d9f68fa
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
6 changed files with 81 additions and 14 deletions

View File

@@ -3,9 +3,9 @@ use std::borrow::Cow;
 use heed::BoxedError;
 use obkv::KvReaderU16;
 
-pub struct ObkvCompressedCodec;
+pub struct CompressedObkvCodec;
 
-impl<'a> heed::BytesDecode<'a> for ObkvCompressedCodec {
+impl<'a> heed::BytesDecode<'a> for CompressedObkvCodec {
     type DItem = CompressedKvReaderU16<'a>;
 
     fn bytes_decode(bytes: &'a [u8]) -> Result<Self::DItem, BoxedError> {
@@ -13,7 +13,7 @@ impl<'a> heed::BytesDecode<'a> for ObkvCompressedCodec {
     }
 }
 
-impl heed::BytesEncode<'_> for ObkvCompressedCodec {
+impl heed::BytesEncode<'_> for CompressedObkvCodec {
     type EItem = CompressedKvWriterU16;
 
     fn bytes_encode(item: &Self::EItem) -> Result<Cow<[u8]>, BoxedError> {
@@ -30,8 +30,10 @@ impl<'a> CompressedKvReaderU16<'a> {
         buffer: &'b mut Vec<u8>,
         dictionnary: &[u8],
     ) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
-        let max_size = lz4_flex::block::get_maximum_output_size(self.0.len());
+        // TODO WHAT THE HECK!!! WHY DO I NEED TO INCREASE THE SIZE PROVIDED
+        let max_size = lz4_flex::block::get_maximum_output_size(self.0.len()) * 2;
         buffer.resize(max_size, 0);
+        // TODO loop to increase the buffer size if need be
         let size = lz4_flex::block::decompress_into_with_dict(
             self.0,
             &mut buffer[..max_size],
@@ -50,7 +52,11 @@ pub struct CompressedKvWriterU16(Vec<u8>);
 impl CompressedKvWriterU16 {
+    // TODO ask for a KvReaderU16 here
-    pub fn new_with_dictionnary(writer: &[u8], dictionnary: &[u8]) -> Self {
-        CompressedKvWriterU16(lz4_flex::block::compress_with_dict(writer, dictionnary))
+    pub fn new_with_dictionary(writer: &[u8], dictionary: &[u8]) -> Self {
+        CompressedKvWriterU16(lz4_flex::block::compress_with_dict(writer, dictionary))
     }
+
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
 }
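A minimal round-trip sketch (not part of the commit) of what this codec does with lz4_flex; the document bytes and dictionary below are made-up placeholders. One note on the TODO above: lz4_flex::block::get_maximum_output_size(n) bounds the worst-case *compressed* size of an n-byte input, not the decompressed size of an n-byte block, which is likely why the `* 2` workaround is needed.

fn roundtrip() -> Result<(), lz4_flex::block::DecompressError> {
    let dictionary: &[u8] = b"movietraileradventurehorror";
    let document: &[u8] = b"adventure movie trailer, horror themed";

    // Raw LZ4 block compression with an external dictionary; no size header
    // is prepended, so the caller must track the decompressed length itself.
    let compressed = lz4_flex::block::compress_with_dict(document, dictionary);

    // Decompression needs a caller-sized output buffer and the same dictionary.
    let mut buffer = vec![0u8; document.len()];
    let size = lz4_flex::block::decompress_into_with_dict(&compressed, &mut buffer, dictionary)?;
    assert_eq!(&buffer[..size], document);
    Ok(())
}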

View File

@@ -21,7 +21,7 @@ use thiserror::Error;
 pub use self::beu16_str_codec::BEU16StrCodec;
 pub use self::beu32_str_codec::BEU32StrCodec;
 pub use self::compressed_obkv_codec::{
-    CompressedKvReaderU16, CompressedKvWriterU16, ObkvCompressedCodec,
+    CompressedKvReaderU16, CompressedKvWriterU16, CompressedObkvCodec,
 };
 pub use self::field_id_word_count_codec::FieldIdWordCountCodec;
 pub use self::fst_set_codec::FstSetCodec;

View File

@@ -20,7 +20,7 @@ use crate::heed_codec::facet::{
     FieldIdCodec, OrderedF64Codec,
 };
 use crate::heed_codec::{
-    BEU16StrCodec, CompressedKvReaderU16, FstSetCodec, ObkvCompressedCodec, ScriptLanguageCodec,
+    BEU16StrCodec, CompressedKvReaderU16, CompressedObkvCodec, FstSetCodec, ScriptLanguageCodec,
     StrBEU16Codec, StrRefCodec,
 };
 use crate::order_by_map::OrderByMap;
@@ -174,7 +174,7 @@ pub struct Index {
     pub vector_arroy: arroy::Database<arroy::distances::Angular>,
 
     /// Maps the document id to the document as an obkv store.
-    pub(crate) documents: Database<BEU32, ObkvCompressedCodec>,
+    pub(crate) documents: Database<BEU32, CompressedObkvCodec>,
 }
 
 impl Index {
@@ -356,6 +356,11 @@ impl Index {
         )
     }
 
+    /// Deletes the document compression dictionary.
+    pub fn delete_document_compression_dictionary(&self, wtxn: &mut RwTxn) -> heed::Result<bool> {
+        self.main.remap_key_type::<Str>().delete(wtxn, main_key::DOCUMENT_COMPRESSION_DICTIONARY)
+    }
+
     /// Returns the optional dictionnary to be used when reading the OBKV documents.
     pub fn document_compression_dictionary<'t>(
         &self,
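The getter is truncated here, and put_document_compression_dictionary is called later in the indexing code; a hedged sketch of what that putter likely looks like, mirroring the delete method above (the commit's actual implementation may differ):

pub fn put_document_compression_dictionary(
    &self,
    wtxn: &mut RwTxn,
    dictionary: &[u8],
) -> heed::Result<()> {
    // Stores the raw dictionary bytes under the same main-database key.
    self.main.remap_types::<Str, Bytes>().put(
        wtxn,
        main_key::DOCUMENT_COMPRESSION_DICTIONARY,
        dictionary,
    )
}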

View File

@@ -63,6 +63,7 @@ impl<'t, 'i> ClearDocuments<'t, 'i> {
         self.index.put_field_distribution(self.wtxn, &FieldDistribution::default())?;
         self.index.delete_geo_rtree(self.wtxn)?;
         self.index.delete_geo_faceted_documents_ids(self.wtxn)?;
+        self.index.delete_document_compression_dictionary(self.wtxn)?;
 
         // Remove all user-provided bits from the configs
         let mut configs = self.index.embedding_configs(self.wtxn)?;

View File

@@ -13,8 +13,8 @@ use std::sync::Arc;
 use crossbeam_channel::{Receiver, Sender};
 use grenad::{Merger, MergerBuilder};
-use heed::types::Str;
-use heed::Database;
+use heed::types::{Bytes, Str};
+use heed::{Database, PutFlags};
 use rand::SeedableRng;
 use roaring::RoaringBitmap;
 use serde::{Deserialize, Serialize};
@@ -34,6 +34,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
 use crate::documents::{obkv_to_object, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
+use crate::heed_codec::{CompressedKvWriterU16, CompressedObkvCodec};
 use crate::thread_pool_no_abort::ThreadPoolNoAbortBuilder;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
@@ -266,7 +267,7 @@
         target = "indexing::details",
         name = "index_documents_raw"
     )]
-    pub fn execute_raw(self, output: TransformOutput) -> Result<u64>
+    pub fn execute_raw(mut self, output: TransformOutput) -> Result<u64>
     where
         FP: Fn(UpdateIndexingStep) + Sync,
         FA: Fn() -> bool + Sync,
@@ -565,6 +566,14 @@ where
             word_fid_docids.map(MergerBuilder::build),
         )?;
 
+        // TODO increase this number to 10k and put it in a const somewhere
+        //      I don't like that this dangerous condition is here...
+        if number_of_documents > 1_000
+            && self.index.document_compression_dictionary(self.wtxn)?.is_none()
+        {
+            self.manage_compression_dictionary()?;
+        }
+
         Ok(number_of_documents)
     }
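A small sketch of what the TODO above asks for, using the 10k figure from the comment; the constant name is illustrative, not from the commit:

// Hypothetical constant lifting the inline 1_000 threshold out of the method.
const COMPRESSION_DICTIONARY_THRESHOLD: u64 = 10_000;

if number_of_documents > COMPRESSION_DICTIONARY_THRESHOLD
    && self.index.document_compression_dictionary(self.wtxn)?.is_none()
{
    self.manage_compression_dictionary()?;
}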
@@ -575,7 +584,7 @@
         name = "index_documents_prefix_databases"
     )]
     pub fn execute_prefix_databases(
-        self,
+        &mut self,
         word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
         exact_word_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
         word_position_docids: Option<Merger<CursorClonableMmap, MergeFn>>,
@@ -747,6 +756,40 @@ where
         Ok(())
     }
 
+    /// Computes a new dictionary and compresses the documents with it in the database.
+    ///
+    /// Documents still need to be compressed directly when they are written to the database while a dictionary exists.
+    #[tracing::instrument(
+        level = "trace",
+        skip_all,
+        target = "indexing::compression",
+        name = "compress_documents_database"
+    )]
+    pub fn manage_compression_dictionary(&mut self) -> Result<()> {
+        // TODO This is a dumb dictionary, just so you get the idea.
+        //      We need to compute a better one by using zstd or something else.
+        let dictionary = b"movietraileradventurehorror";
+        self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
+        // TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
+        let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
+        while let Some(result) = iter.next() {
+            let (docid, document) = result?;
+            // TODO manage this unwrap correctly
+            let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
+            // SAFETY: the compressed document is entirely owned
+            unsafe {
+                iter.put_current_with_options::<CompressedObkvCodec>(
+                    PutFlags::empty(),
+                    &docid,
+                    &compressed,
+                )?;
+            }
+        }
+
+        Ok(())
+    }
 }
 
 /// Run the word prefix docids update operation.
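The TODO in manage_compression_dictionary mentions computing a real dictionary with zstd. A hedged sketch of what that could look like with the zstd crate's dictionary training; sample_documents is a hypothetical helper that would collect document bytes from the database, and the trained bytes would still be handed to the lz4_flex calls as a plain byte dictionary:

// Hypothetical replacement for the hardcoded b"movietraileradventurehorror":
// train a dictionary from a sample of already-stored documents.
fn train_dictionary(sample_documents: &[Vec<u8>]) -> std::io::Result<Vec<u8>> {
    // 64 KiB is an arbitrary cap on the trained dictionary size.
    zstd::dict::from_samples(sample_documents, 64 * 1024)
}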

View File

@@ -19,6 +19,7 @@ use super::helpers::{
 use super::MergeFn;
 use crate::external_documents_ids::{DocumentOperation, DocumentOperationKind};
 use crate::facet::FacetType;
+use crate::heed_codec::CompressedKvWriterU16;
 use crate::index::db_name::DOCUMENTS;
 use crate::index::IndexEmbeddingConfig;
 use crate::proximity::MAX_DISTANCE;
@@ -162,6 +163,7 @@ pub(crate) fn write_typed_chunk_into_index(
                 .into_iter()
                 .map(|IndexEmbeddingConfig { name, .. }| name)
                 .collect();
+            let dictionary = index.document_compression_dictionary(wtxn)?.map(Vec::from);
             let mut vectors_buffer = Vec::new();
             while let Some((key, reader)) = iter.next()? {
                 let mut writer: KvWriter<_, FieldId> = KvWriter::memory();
@@ -211,7 +213,17 @@ pub(crate) fn write_typed_chunk_into_index(
                 let db = index.documents.remap_data_type::<Bytes>();
 
                 if !writer.is_empty() {
-                    db.put(wtxn, &docid, &writer.into_inner().unwrap())?;
+                    let uncompressed_document_bytes = writer.into_inner().unwrap();
+                    match dictionary.as_ref() {
+                        Some(dictionary) => {
+                            let compressed = CompressedKvWriterU16::new_with_dictionary(
+                                &uncompressed_document_bytes,
+                                dictionary,
+                            );
+                            db.put(wtxn, &docid, compressed.as_bytes())?
+                        }
+                        None => db.put(wtxn, &docid, &uncompressed_document_bytes)?,
+                    }
                     operations.push(DocumentOperation {
                         external_id: external_id.to_string(),
                         internal_id: docid,
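For completeness, a hedged sketch of the read path these changes imply, using decompress_with from the codec hunk above; it assumes the truncated document_compression_dictionary getter returns the raw dictionary bytes, and error handling is simplified:

// Reading a document back once a dictionary exists in the index.
let dictionary = index
    .document_compression_dictionary(rtxn)?
    .expect("a dictionary exists once enough documents are indexed");
let compressed = index.documents.get(rtxn, &docid)?.expect("the document exists");
let mut buffer = Vec::new();
// `document` is an obkv::KvReaderU16 borrowing from `buffer`.
let document = compressed.decompress_with(&mut buffer, dictionary)?;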