Fix the usage of compressed documents

This commit is contained in:
Clément Renault 2024-12-17 16:25:53 +01:00
parent b7ae720a7e
commit c1dd489adc
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
3 changed files with 17 additions and 22 deletions

View File

@@ -891,10 +891,10 @@ impl IndexScheduler {
let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents); let (atomic, update_document_progress) = AtomicDocumentStep::new(nb_documents);
progress.update_progress(update_document_progress); progress.update_progress(update_document_progress);
let documents = index let documents = index
.all_documents(&rtxn) .all_compressed_documents(&rtxn)
.map_err(|e| Error::from_milli(e, Some(uid.to_string())))?; .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
// 3.1. Dump the documents // 3.1. Dump the documents
for ret in index.all_compressed_documents(&rtxn)? { for ret in documents {
if self.must_stop_processing.get() { if self.must_stop_processing.get() {
return Err(Error::AbortedTask); return Err(Error::AbortedTask);
} }

View File

@@ -3129,10 +3129,18 @@ mod tests {
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
let field_ids_map = index.fields_ids_map(&rtxn).unwrap(); let field_ids_map = index.fields_ids_map(&rtxn).unwrap();
let field_ids = field_ids_map.ids().collect::<Vec<_>>(); let field_ids = field_ids_map.ids().collect::<Vec<_>>();
let dictionary = index.document_decompression_dictionary(&rtxn).unwrap();
let mut buffer = Vec::new();
let documents = index let documents = index
.all_documents(&rtxn) .all_compressed_documents(&rtxn)
.unwrap() .unwrap()
.map(|ret| obkv_to_json(&field_ids, &field_ids_map, ret.unwrap().1).unwrap()) .map(|ret| {
let (_docid, compressed_doc) = ret.unwrap();
let doc = compressed_doc
.decompress_with_optional_dictionary(&mut buffer, dictionary.as_ref())
.unwrap();
obkv_to_json(&field_ids, &field_ids_map, doc).unwrap()
})
.collect::<Vec<_>>(); .collect::<Vec<_>>();
snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork"); snapshot!(serde_json::to_string_pretty(&documents).unwrap(), name: "documents_remaining_should_only_be_bork");
} }

View File

@@ -1293,26 +1293,13 @@ impl<'a> HitMaker<'a> {
} }
pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> { pub fn make_hit(&self, id: u32, score: &[ScoreDetails]) -> milli::Result<SearchHit> {
let (_, obkv) = let mut buffer = Vec::new();
self.index.iter_documents(self.rtxn, std::iter::once(id))?.next().unwrap()?; let dict = self.index.document_decompression_dictionary(self.rtxn)?;
let compressed = self.index.compressed_document(self.rtxn, id)?.unwrap();
let doc = compressed.decompress_with_optional_dictionary(&mut buffer, dict.as_ref())?;
// let mut formatter_builder = MatcherBuilder::new(matching_words, tokenizer_builder.build());
// formatter_builder.crop_marker(format.crop_marker);
// formatter_builder.highlight_prefix(format.highlight_pre_tag);
// formatter_builder.highlight_suffix(format.highlight_post_tag);
// let decompression_dictionary = index.document_decompression_dictionary(rtxn)?;
// let mut buffer = Vec::new();
// let mut documents = Vec::new();
// let embedding_configs = index.embedding_configs(rtxn)?;
// let documents_iter = index.compressed_documents(rtxn, documents_ids)?;
// for ((id, compressed), score) in documents_iter.into_iter().zip(document_scores.into_iter()) {
// let obkv = compressed
// .decompress_with_optional_dictionary(&mut buffer, decompression_dictionary.as_ref())
// // TODO use a better error?
// .map_err(|e| MeilisearchHttpError::HeedError(e.into()))?;
// First generate a document with all the displayed fields // First generate a document with all the displayed fields
let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, obkv)?; let displayed_document = make_document(&self.displayed_ids, &self.fields_ids_map, doc)?;
let add_vectors_fid = let add_vectors_fid =
self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve); self.vectors_fid.filter(|_fid| self.retrieve_vectors == RetrieveVectors::Retrieve);