From 1aa8ed9ef7bc02fe805e77e2feee4b81031acb05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Tue, 21 May 2024 14:53:26 +0200 Subject: [PATCH] Make the original sorter optional --- milli/src/update/index_documents/mod.rs | 22 +++++++++-- milli/src/update/index_documents/transform.rs | 38 ++++++++++++------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 4d2fac7cb..cceb25338 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -6,6 +6,7 @@ mod typed_chunk; use std::collections::{HashMap, HashSet}; use std::io::{Read, Seek}; +use std::iter; use std::num::NonZeroU32; use std::result::Result as StdResult; use std::sync::Arc; @@ -373,8 +374,11 @@ where } }; - let original_documents = grenad::Reader::new(original_documents)?; let flattened_documents = grenad::Reader::new(flattened_documents)?; + let original_documents = match original_documents { + Some(original_documents) => Some(grenad::Reader::new(original_documents)?), + None => None, + }; let max_positions_per_attributes = self.indexer_config.max_positions_per_attributes; @@ -393,11 +397,21 @@ where pool.install(|| { rayon::spawn(move || { let child_span = tracing::trace_span!(target: "indexing::details", parent: &current_span, "extract_and_send_grenad_chunks"); - let _enter = child_span.enter(); - puffin::profile_scope!("extract_and_send_grenad_chunks"); + let _enter = child_span.enter(); + puffin::profile_scope!("extract_and_send_grenad_chunks"); // split obkv file into several chunks let original_chunk_iter = - grenad_obkv_into_chunks(original_documents, pool_params, documents_chunk_size); + match original_documents { + Some(original_documents) => { + grenad_obkv_into_chunks( + original_documents, + pool_params, + documents_chunk_size + ) + .map(either::Either::Left) + }, + None => Ok(either::Right(iter::empty())), + }; // split obkv file into 
several chunks let flattened_chunk_iter = diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 8a3463e6f..f7e3d79fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -33,7 +33,7 @@ pub struct TransformOutput { pub settings_diff: InnerIndexSettingsDiff, pub field_distribution: FieldDistribution, pub documents_count: usize, - pub original_documents: File, + pub original_documents: Option<File>, pub flattened_documents: File, } @@ -822,7 +822,9 @@ impl<'a, 'i> Transform<'a, 'i> { settings_diff, field_distribution, documents_count: self.documents_count, - original_documents: original_documents.into_inner().map_err(|err| err.into_error())?, + original_documents: Some( + original_documents.into_inner().map_err(|err| err.into_error())?, + ), flattened_documents: flattened_documents .into_inner() .map_err(|err| err.into_error())?, @@ -891,14 +893,18 @@ impl<'a, 'i> Transform<'a, 'i> { let documents_count = documents_ids.len() as usize; // We initialize the sorter with the user indexing settings. - let mut original_sorter = create_sorter( - grenad::SortAlgorithm::Stable, - keep_first, - self.indexer_settings.chunk_compression_type, - self.indexer_settings.chunk_compression_level, - self.indexer_settings.max_nb_chunks, - self.indexer_settings.max_memory.map(|mem| mem / 2), - ); + let mut original_sorter = if settings_diff.reindex_vectors() { + Some(create_sorter( + grenad::SortAlgorithm::Stable, + keep_first, + self.indexer_settings.chunk_compression_type, + self.indexer_settings.chunk_compression_level, + self.indexer_settings.max_nb_chunks, + self.indexer_settings.max_memory.map(|mem| mem / 2), + )) + } else { + None + }; // We initialize the sorter with the user indexing settings. 
let mut flattened_sorter = create_sorter( @@ -929,7 +935,9 @@ impl<'a, 'i> Transform<'a, 'i> { document_sorter_key_buffer.clear(); document_sorter_key_buffer.extend_from_slice(&docid.to_be_bytes()); document_sorter_key_buffer.extend_from_slice(external_id.as_bytes()); - original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + if let Some(original_sorter) = original_sorter.as_mut() { + original_sorter.insert(&document_sorter_key_buffer, &original_obkv_buffer)?; + } flattened_sorter.insert(docid.to_be_bytes(), &flattened_obkv_buffer)?; } @@ -941,16 +949,18 @@ impl<'a, 'i> Transform<'a, 'i> { }; // Once we have written all the documents, we merge everything into a Reader. - let original_documents = sorter_into_reader(original_sorter, grenad_params)?; - let flattened_documents = sorter_into_reader(flattened_sorter, grenad_params)?; + let original_documents = match original_sorter { + Some(original_sorter) => Some(sorter_into_reader(original_sorter, grenad_params)?), + None => None, + }; Ok(TransformOutput { primary_key, field_distribution, settings_diff, documents_count, - original_documents: original_documents.into_inner().into_inner(), + original_documents: original_documents.map(|od| od.into_inner().into_inner()), flattened_documents: flattened_documents.into_inner().into_inner(), }) }