From 7d75988a538072e083a52ca0e337c24c2ac887c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 17 Dec 2024 16:56:54 +0100 Subject: [PATCH] Fetch the compression dictionary only once to decompress documents --- crates/milli/src/update/new/document.rs | 15 ++++++++-- .../milli/src/update/new/document_change.rs | 24 +++++++++++----- .../src/update/new/extract/documents/mod.rs | 4 +++ .../new/extract/faceted/extract_facets.rs | 25 +++++++++++++++-- .../milli/src/update/new/extract/geo/mod.rs | 17 +++++++++-- .../extract/searchable/extract_word_docids.rs | 25 +++++++++++++++-- .../extract_word_pair_proximity_docids.rs | 28 +++++++++++++++---- .../src/update/new/extract/vectors/mod.rs | 5 ++++ .../milli/src/update/new/vector_document.rs | 21 ++++++++++++-- 9 files changed, 138 insertions(+), 26 deletions(-) diff --git a/crates/milli/src/update/new/document.rs b/crates/milli/src/update/new/document.rs index 5ac527231..e01d39b54 100644 --- a/crates/milli/src/update/new/document.rs +++ b/crates/milli/src/update/new/document.rs @@ -5,6 +5,7 @@ use bumparaw_collections::RawMap; use heed::RoTxn; use rustc_hash::FxBuildHasher; use serde_json::value::RawValue; +use zstd::dict::DecoderDictionary; use super::vector_document::VectorDocument; use super::{KvReaderFieldId, KvWriterFieldId}; @@ -130,12 +131,12 @@ impl<'t, Mapper: FieldIdMapper> DocumentFromDb<'t, Mapper> { rtxn: &'t RoTxn, index: &'t Index, db_fields_ids_map: &'t Mapper, + db_document_decompression_dictionary: Option<&DecoderDictionary<'static>>, doc_alloc: &'t Bump, ) -> Result> { match index.compressed_document(rtxn, docid)? { Some(compressed) => { - /// TODO maybe give the dictionary as a parameter - let content = match index.document_decompression_dictionary(rtxn)? { + let content = match db_document_decompression_dictionary { Some(dictionary) => compressed.decompress_into_bump(doc_alloc, &dictionary)?, None => compressed.as_non_compressed(), }; @@ -206,10 +207,18 @@ impl<'a, 'doc, 't, Mapper: FieldIdMapper> MergedDocument<'a, 'doc, 't, Mapper> { rtxn: &'t RoTxn, index: &'t Index, db_fields_ids_map: &'t Mapper, + db_document_decompression_dictionary: Option<&'t DecoderDictionary<'static>>, doc_alloc: &'t Bump, new_doc: DocumentFromVersions<'a, 'doc>, ) -> Result { - let db = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map, doc_alloc)?; + let db = DocumentFromDb::new( + docid, + rtxn, + index, + db_fields_ids_map, + db_document_decompression_dictionary, + doc_alloc, + )?; Ok(Self { new_doc, db }) } diff --git a/crates/milli/src/update/new/document_change.rs b/crates/milli/src/update/new/document_change.rs index c209ae67e..fb37053e5 100644 --- a/crates/milli/src/update/new/document_change.rs +++ b/crates/milli/src/update/new/document_change.rs @@ -1,5 +1,6 @@ use bumpalo::Bump; use heed::RoTxn; +use zstd::dict::DecoderDictionary; use super::document::{ Document as _, DocumentFromDb, DocumentFromVersions, MergedDocument, Versions, @@ -72,9 +73,10 @@ impl<'doc> Deletion<'doc> { rtxn: &'a RoTxn, index: &'a Index, mapper: &'a Mapper, + dictionary: Option<&'a DecoderDictionary<'static>>, doc_alloc: &'a Bump, ) -> Result> { - Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper, doc_alloc)?.ok_or( + Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper, dictionary, doc_alloc)?.ok_or( crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, )?) } @@ -128,9 +130,10 @@ impl<'doc> Update<'doc> { rtxn: &'a RoTxn, index: &'a Index, mapper: &'a Mapper, + dictionary: Option<&'a DecoderDictionary<'static>>, doc_alloc: &'a Bump, ) -> Result> { - Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper, doc_alloc)?.ok_or( + Ok(DocumentFromDb::new(self.docid, rtxn, index, mapper, dictionary, doc_alloc)?.ok_or( crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, )?) } @@ -140,11 +143,13 @@ impl<'doc> Update<'doc> { rtxn: &'a RoTxn, index: &'a Index, mapper: &'a Mapper, + dictionary: Option<&'a DecoderDictionary<'static>>, doc_alloc: &'a Bump, ) -> Result> { - Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, doc_alloc)?.ok_or( - crate::error::UserError::UnknownInternalDocumentId { document_id: self.docid }, - )?) + Ok(VectorDocumentFromDb::new(self.docid, index, rtxn, mapper, dictionary, doc_alloc)? + .ok_or(crate::error::UserError::UnknownInternalDocumentId { + document_id: self.docid, + })?) } pub fn updated(&self) -> DocumentFromVersions<'_, 'doc> { @@ -156,6 +161,7 @@ impl<'doc> Update<'doc> { rtxn: &'t RoTxn, index: &'t Index, mapper: &'t Mapper, + dictionary: Option<&'t DecoderDictionary<'static>>, doc_alloc: &'t Bump, ) -> Result> { if self.has_deletion { @@ -166,6 +172,7 @@ impl<'doc> Update<'doc> { rtxn, index, mapper, + dictionary, doc_alloc, DocumentFromVersions::new(&self.new), ) @@ -182,6 +189,7 @@ impl<'doc> Update<'doc> { rtxn: &'t RoTxn, index: &'t Index, mapper: &'t Mapper, + dictionary: Option<&'t DecoderDictionary<'static>>, doc_alloc: &'t Bump, ) -> Result { let mut changed = false; @@ -198,7 +206,7 @@ impl<'doc> Update<'doc> { updated_selected_field_count += 1; let current = match cached_current { Some(current) => current, - None => self.current(rtxn, index, mapper, doc_alloc)?, + None => self.current(rtxn, index, mapper, dictionary, doc_alloc)?, }; let current_value = current.top_level_field(key)?; let Some(current_value) = current_value else { @@ -228,7 +236,7 @@ impl<'doc> Update<'doc> { let has_deleted_fields = { let current = match cached_current { Some(current) => current, - None => self.current(rtxn, index, mapper, doc_alloc)?, + None => self.current(rtxn, index, mapper, dictionary, doc_alloc)?, }; let mut current_selected_field_count = 0; @@ -260,6 +268,7 @@ impl<'doc> Update<'doc> { rtxn: &'doc RoTxn, index: &'doc Index, mapper: &'doc Mapper, + dictionary: Option<&'doc DecoderDictionary<'static>>, doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result>> { @@ -277,6 +286,7 @@ impl<'doc> Update<'doc> { index, rtxn, mapper, + dictionary, &self.new, doc_alloc, embedders, diff --git a/crates/milli/src/update/new/extract/documents/mod.rs b/crates/milli/src/update/new/extract/documents/mod.rs index 61fe83662..5c1abcb89 100644 --- a/crates/milli/src/update/new/extract/documents/mod.rs +++ b/crates/milli/src/update/new/extract/documents/mod.rs @@ -82,6 +82,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { &context.rtxn, context.index, &context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?; let geo_iter = @@ -103,6 +104,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { &context.rtxn, context.index, &context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?; let geo_iter = @@ -131,12 +133,14 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for DocumentsExtractor<'a, 'b> { &context.rtxn, context.index, &context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?; let vector_content = update.merged_vectors( &context.rtxn, context.index, &context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, self.embedders, )?; diff --git a/crates/milli/src/update/new/extract/faceted/extract_facets.rs b/crates/milli/src/update/new/extract/faceted/extract_facets.rs index bbb53955f..03c73dead 100644 --- a/crates/milli/src/update/new/extract/faceted/extract_facets.rs +++ b/crates/milli/src/update/new/extract/faceted/extract_facets.rs @@ -79,7 +79,13 @@ impl FacetedDocidsExtractor { let res = match document_change { DocumentChange::Deletion(inner) => extract_document_facets( attributes_to_extract, - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, depth, value| { @@ -102,6 +108,7 @@ impl FacetedDocidsExtractor { rtxn, index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )? { return Ok(()); @@ -109,7 +116,13 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, depth, value| { @@ -129,7 +142,13 @@ impl FacetedDocidsExtractor { extract_document_facets( attributes_to_extract, - inner.merged(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.merged( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, inner.external_document_id(), new_fields_ids_map.deref_mut(), &mut |fid, depth, value| { diff --git a/crates/milli/src/update/new/extract/geo/mod.rs b/crates/milli/src/update/new/extract/geo/mod.rs index 4ecb78ba0..a4674c08d 100644 --- a/crates/milli/src/update/new/extract/geo/mod.rs +++ b/crates/milli/src/update/new/extract/geo/mod.rs @@ -158,6 +158,7 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { let index = context.index; let max_memory = self.grenad_parameters.max_memory_by_thread(); let db_fields_ids_map = context.db_fields_ids_map; + let db_document_decompression_dictionary = context.db_document_decompression_dictionary; let doc_alloc = &context.doc_alloc; let mut data_ref = context.data.borrow_mut_or_yield(); @@ -174,7 +175,13 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { DocumentChange::Deletion(deletion) => { let docid = deletion.docid(); let external_id = deletion.external_document_id(); - let current = deletion.current(rtxn, index, db_fields_ids_map, doc_alloc)?; + let current = deletion.current( + rtxn, + index, + db_fields_ids_map, + db_document_decompression_dictionary, + doc_alloc, + )?; let current_geo = current .geo_field()? .map(|geo| extract_geo_coordinates(external_id, geo)) @@ -189,7 +196,13 @@ impl<'extractor> Extractor<'extractor> for GeoExtractor { } } DocumentChange::Update(update) => { - let current = update.current(rtxn, index, db_fields_ids_map, doc_alloc)?; + let current = update.current( + rtxn, + index, + db_fields_ids_map, + db_document_decompression_dictionary, + doc_alloc, + )?; let external_id = update.external_document_id(); let docid = update.docid(); diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs index c8a11923b..946bdcd56 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -339,7 +339,13 @@ impl WordDocidsExtractors { ) }; document_tokenizer.tokenize_document( - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, new_fields_ids_map, &mut token_fn, )?; @@ -350,6 +356,7 @@ impl WordDocidsExtractors { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )? { return Ok(()); @@ -366,7 +373,13 @@ impl WordDocidsExtractors { ) }; document_tokenizer.tokenize_document( - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, new_fields_ids_map, &mut token_fn, )?; @@ -382,7 +395,13 @@ impl WordDocidsExtractors { ) }; document_tokenizer.tokenize_document( - inner.merged(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?, + inner.merged( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?, new_fields_ids_map, &mut token_fn, )?; diff --git a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index 422f11037..cd34287ca 100644 --- a/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/crates/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -58,8 +58,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { let docid = document_change.docid(); match document_change { DocumentChange::Deletion(inner) => { - let document = - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?; + let document = inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?; process_document_tokens( document, document_tokenizer, @@ -76,13 +81,19 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { rtxn, index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )? { return Ok(()); } - let document = - inner.current(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?; + let document = inner.current( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?; process_document_tokens( document, document_tokenizer, @@ -92,8 +103,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { del_word_pair_proximity.push(((w1, w2), prox)); }, )?; - let document = - inner.merged(rtxn, index, context.db_fields_ids_map, &context.doc_alloc)?; + let document = inner.merged( + rtxn, + index, + context.db_fields_ids_map, + context.db_document_decompression_dictionary, + &context.doc_alloc, + )?; process_document_tokens( document, document_tokenizer, diff --git a/crates/milli/src/update/new/extract/vectors/mod.rs b/crates/milli/src/update/new/extract/vectors/mod.rs index bebe23c90..69922d3f0 100644 --- a/crates/milli/src/update/new/extract/vectors/mod.rs +++ b/crates/milli/src/update/new/extract/vectors/mod.rs @@ -97,6 +97,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?; let new_vectors = update.updated_vectors(&context.doc_alloc, self.embedders)?; @@ -135,6 +136,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?, context.new_fields_ids_map, @@ -146,6 +148,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?, context.new_fields_ids_map, @@ -167,6 +170,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?, context.new_fields_ids_map, @@ -178,6 +182,7 @@ impl<'a, 'b, 'extractor> Extractor<'extractor> for EmbeddingExtractor<'a, 'b> { &context.rtxn, context.index, context.db_fields_ids_map, + context.db_document_decompression_dictionary, &context.doc_alloc, )?, context.new_fields_ids_map, diff --git a/crates/milli/src/update/new/vector_document.rs b/crates/milli/src/update/new/vector_document.rs index 498a3f2a2..74dc89070 100644 --- a/crates/milli/src/update/new/vector_document.rs +++ b/crates/milli/src/update/new/vector_document.rs @@ -7,6 +7,7 @@ use heed::RoTxn; use rustc_hash::FxBuildHasher; use serde::Serialize; use serde_json::value::RawValue; +use zstd::dict::DecoderDictionary; use super::document::{Document, DocumentFromDb, DocumentFromVersions, Versions}; use super::indexer::de::DeserrRawValue; @@ -96,9 +97,17 @@ impl<'t> VectorDocumentFromDb<'t> { index: &'t Index, rtxn: &'t RoTxn, db_fields_ids_map: &'t Mapper, + db_document_decompression_dictionary: Option<&'t DecoderDictionary<'static>>, doc_alloc: &'t Bump, ) -> Result> { - let Some(document) = DocumentFromDb::new(docid, rtxn, index, db_fields_ids_map, doc_alloc)? + let Some(document) = DocumentFromDb::new( + docid, + rtxn, + index, + db_fields_ids_map, + db_document_decompression_dictionary, + doc_alloc, + )? else { return Ok(None); }; @@ -283,11 +292,19 @@ impl<'doc> MergedVectorDocument<'doc> { index: &'doc Index, rtxn: &'doc RoTxn, db_fields_ids_map: &'doc Mapper, + db_document_decompression_dictionary: Option<&'doc DecoderDictionary<'static>>, versions: &Versions<'doc>, doc_alloc: &'doc Bump, embedders: &'doc EmbeddingConfigs, ) -> Result> { - let db = VectorDocumentFromDb::new(docid, index, rtxn, db_fields_ids_map, doc_alloc)?; + let db = VectorDocumentFromDb::new( + docid, + index, + rtxn, + db_fields_ids_map, + db_document_decompression_dictionary, + doc_alloc, + )?; let new_doc = VectorDocumentFromVersions::new(external_document_id, versions, doc_alloc, embedders)?; Ok(if db.is_none() && new_doc.is_none() { None } else { Some(Self { new_doc, db }) })