Mirror of https://github.com/meilisearch/MeiliSearch
Fix the usage of compressed documents
parent: b7ae720a7e
commit: c1dd489adc
@@ -891,10 +891,10 @@ impl IndexScheduler {
             let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents);
             progress.update_progress(update_document_progress);
             let documents = index
-                .all_documents(&rtxn)
+                .all_compressed_documents(&rtxn)
                 .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
             // 3.1. Dump the documents
-            for ret in index.all_compressed_documents(&rtxn)? {
+            for ret in documents {
                 if self.must_stop_processing.get() {
                     return Err(Error::AbortedTask);
                 }
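This first hunk fixes the dump path in the index scheduler: the iterator is now created once with all_compressed_documents and the loop consumes that binding, instead of binding uncompressed documents and then iterating a second, separate compressed iterator. As a rough illustration of the resulting shape, here is a self-contained sketch; CompressedDoc and its decompress_with_optional_dictionary method are stand-ins invented for this example, not milli's real types.

// Stand-in for milli's compressed document type (hypothetical, for
// illustration only; the real type lives in milli and wraps zstd data).
struct CompressedDoc(Vec<u8>);

impl CompressedDoc {
    // Stand-in for decompress_with_optional_dictionary: clears the scratch
    // buffer, "decompresses" into it, and returns a borrow of the result.
    fn decompress_with_optional_dictionary<'b>(
        &self,
        buffer: &'b mut Vec<u8>,
        _dictionary: Option<&[u8]>,
    ) -> Result<&'b [u8], String> {
        buffer.clear();
        buffer.extend_from_slice(&self.0); // real code would run zstd here
        Ok(buffer.as_slice())
    }
}

fn main() -> Result<(), String> {
    let dictionary: Option<Vec<u8>> = None;
    let mut buffer = Vec::new();
    // Bind the compressed iterator once, as the fixed dump path now does...
    let documents = vec![(0u32, CompressedDoc(br#"{"id":0}"#.to_vec()))];
    // ...then consume that single binding in the loop.
    for (docid, compressed) in documents {
        let raw = compressed
            .decompress_with_optional_dictionary(&mut buffer, dictionary.as_deref())?;
        println!("document {docid}: {} decompressed bytes", raw.len());
    }
    Ok(())
}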
@@ -3129,10 +3129,18 @@ mod tests {
         let rtxn = index.read_txn().unwrap();
         let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
         let field_ids = field_ids_map.ids().collect::<Vec<_>>();
+        let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
+        let mut buffer = Vec::new();
         let documents = index
-            .all_documents(&rtxn)
+            .all_compressed_documents(&rtxn)
             .unwrap()
-            .map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap())
+            .map(|ret| {
+                let (_docid, compressed_doc) = ret.unwrap();
+                let doc = compressed_doc
+                    .decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
+                    .unwrap();
+                obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
+            })
             .collect::<Vec<_>>();
         snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork");
     }
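The test in this second hunk keeps one scratch buffer alive across the whole map: that is sound because each closure call converts the borrowed decompressed bytes into an owned JSON value (via obkv_to_json) before the next call reuses the buffer. Below is a minimal sketch of just that buffer-reuse pattern, with plain byte strings and String standing in for compressed documents and JSON values.

fn main() {
    let compressed_docs: Vec<Vec<u8>> = vec![b"bork".to_vec(), b"doggo".to_vec()];
    let mut buffer: Vec<u8> = Vec::new();
    let documents: Vec<String> = compressed_docs
        .iter()
        .map(|compressed| {
            // Stand-in for decompress_with_optional_dictionary(&mut buffer, ..):
            // the same scratch buffer is reused on every iteration.
            buffer.clear();
            buffer.extend_from_slice(compressed);
            // Stand-in for obkv_to_json: turn the borrowed bytes into an
            // owned value before the next iteration clobbers the buffer.
            String::from_utf8(buffer.clone()).unwrap()
        })
        .collect();
    assert_eq!(documents, ["bork", "doggo"]);
}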
@@ -1293,26 +1293,13 @@ impl<'a> HitMaker<'a> {
     }

     pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
-        let (_, obkv) =
-            self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?;
+        let mut buffer = Vec::new();
+        let dict = self.index.document_decompression_dictionary(self.rtxn)?;
+        let compressed = self.index.compressed_document(self.rtxn, id)?.unwrap();
+        let doc = compressed.decompress_with_optional_dictionary(&mut buffer, dict.as_ref())?;

-        // let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
-        // formatter_builder.crop_marker(format.crop_marker);
-        // formatter_builder.highlight_prefix(format.highlight_pre_tag);
-        // formatter_builder.highlight_suffix(format.highlight_post_tag);
-        // let decompression_dictionary = index.document_decompression_dictionary(rtxn)?;
-        // let mut buffer = Vec::new();
-        // let mut documents = Vec::new();
-        // let embedding_configs = index.embedding_configs(rtxn)?;
-        // let documents_iter = index.compressed_documents(rtxn, documents_ids)?;
-        // for ((id, compressed), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
-        //     let obkv = compressed
-        //         .decompress_with_optional_dictionary(&mut buffer, decompression_dictionary.as_ref())
-        //         // TODO use a better error?
-        //         .map_err(|e| MeilisearchHttpError::HeedError(e.into()))?;
         // First generate a document with all the displayed fields
-        let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?;
+        let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, doc)?;

         let add_vectors_fid =
             self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);
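This last hunk rewrites make_hit in the search code: instead of iter_documents over a single-id iterator, it fetches the one compressed document by id, decompresses it with the index's optional dictionary, and feeds the result to make_document, also deleting a stale commented-out block left over from an earlier pass. Here is a sketch of the displayed-fields step that follows the decompression; the map-based document and the make_displayed helper are hypothetical simplifications of Meilisearch's obkv-based make_document, written only to show the filtering idea.

use std::collections::BTreeMap;

// Hypothetical simplification: a decompressed document as a
// field-id -> value map instead of milli's obkv representation.
fn make_displayed(
    doc: &BTreeMap<u16, String>,
    displayed_ids: &[u16],
) -> BTreeMap<u16, String> {
    doc.iter()
        .filter(|(fid, _)| displayed_ids.contains(*fid))
        .map(|(fid, value)| (*fid, value.clone()))
        .collect()
}

fn main() {
    let mut doc = BTreeMap::new();
    doc.insert(0u16, "kefir".to_string()); // a displayed field
    doc.insert(1u16, "internal".to_string()); // a non-displayed field
    // Only field 0 is in displayed_ids, so the hit keeps a single field.
    let hit = make_displayed(&doc, &[0]);
    assert_eq!(hit.len(), 1);
    assert_eq!(hit[&0], "kefir");
}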