From c1dd489adc4a6ef24e0c13ebb7530dc59fff5603 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Tue, 17 Dec 2024 16:25:53 +0100
Subject: [PATCH] Fix the usage of compressed documents

---
 crates/index-scheduler/src/batch.rs  |  4 ++--
 crates/index-scheduler/src/lib.rs    | 12 ++++++++++--
 crates/meilisearch/src/search/mod.rs | 23 +++++------------------
 3 files changed, 17 insertions(+), 22 deletions(-)

diff --git a/crates/index-scheduler/src/batch.rs b/crates/index-scheduler/src/batch.rs
index 3cdd2eba9..a0b984f71 100644
--- a/crates/index-scheduler/src/batch.rs
+++ b/crates/index-scheduler/src/batch.rs
@@ -891,10 +891,10 @@ impl IndexScheduler {
                 let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents);
                 progress.update_progress(update_document_progress);
                 let documents = index
-                    .all_documents(&rtxn)
+                    .all_compressed_documents(&rtxn)
                     .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
                 // 3.1. Dump the documents
-                for ret in index.all_compressed_documents(&rtxn)? {
+                for ret in documents {
                     if self.must_stop_processing.get() {
                         return Err(Error::AbortedTask);
                     }
diff --git a/crates/index-scheduler/src/lib.rs b/crates/index-scheduler/src/lib.rs
index 7e20667dc..e7d085aad 100644
--- a/crates/index-scheduler/src/lib.rs
+++ b/crates/index-scheduler/src/lib.rs
@@ -3129,10 +3129,18 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
         let field_ids = field_ids_map.ids().collect::<Vec<_>>();
+        let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
+        let mut buffer = Vec::new();
         let documents = index
-            .all_documents(&rtxn)
+            .all_compressed_documents(&rtxn)
             .unwrap()
-            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
+            .map(|ret| {
+                let (_docid, compressed_doc) = ret.unwrap();
+                let doc = compressed_doc
+                    .decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
+                    .unwrap();
+                obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
+            })
             .collect::<Vec<_>>();
         snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork");
     }
diff --git a/crates/meilisearch/src/search/mod.rs b/crates/meilisearch/src/search/mod.rs
index 9ee28a22a..165611b6c 100644
--- a/crates/meilisearch/src/search/mod.rs
+++ b/crates/meilisearch/src/search/mod.rs
@@ -1293,26 +1293,13 @@ impl<'a> HitMaker<'a> {
     }
 
     pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
-        let (_, obkv) =
-            self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
+        let mut buffer = Vec::new();
+        let dict = self.index.document_decompression_dictionary(self.rtxn)?;
+        let compressed = self.index.compressed_document(self.rtxn, id)?.unwrap();
+        let doc = compressed.decompress_with_optional_dictionary(&mut buffer, dict.as_ref())?;
 
-        // let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
-        // formatter_builder.crop_marker(format.crop_marker);
-        // formatter_builder.highlight_prefix(format.highlight_pre_tag);
-        // formatter_builder.highlight_suffix(format.highlight_post_tag);
-        // let decompression_dictionary = index.document_decompression_dictionary(rtxn)?;
-        // let mut buffer = Vec::new();
-        // let mut documents = Vec::new();
-        // let embedding_configs = index.embedding_configs(rtxn)?;
-        // let documents_iter = index.compressed_documents(rtxn, documents_ids)?;
-        // for ((id, compressed), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
-        //     let obkv = compressed
-        //         .decompress_with_optional_dictionary(&mut buffer, decompression_dictionary.as_ref())
-        //         // TODO use a better error?
-        //         .map_err(|e| MeilisearchHttpError::HeedError(e.into()))?;
         // First generate a document with all the displayed fields
-        let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
-
+        let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, doc)?;
         let add_vectors_fid =
             self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
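
Note: all three call sites now read documents through the compressed-documents
API and decode them explicitly. The sketch below restates that shared pattern
outside the diff, assuming only the API names visible in the hunks above
(`document_decompression_dictionary`, `compressed_document`, and
`decompress_with_optional_dictionary`); `index`, `rtxn`, and `docid` are
hypothetical stand-ins for whatever is in scope at the call site, and error
handling is simplified.

    // Fetch the optional decompression dictionary once per read transaction;
    // `as_ref()` lends it out without giving up ownership.
    let dictionary = index.document_decompression_dictionary(&rtxn)?;
    // Reuse a single scratch buffer across documents to avoid reallocating.
    let mut buffer = Vec::new();
    let compressed = index
        .compressed_document(&rtxn, docid)?
        .expect("document should exist");
    // The decompressed obkv borrows `buffer`, so it must be consumed before
    // the buffer is reused for the next document.
    let doc = compressed.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())?;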