From 5f9f82757dbebec7087cd56b2e624e372c3bbb4f Mon Sep 17 00:00:00 2001 From: ad hoc Date: Wed, 23 Mar 2022 14:48:15 +0100 Subject: [PATCH] refactor spawn_extraction_task --- .../src/update/index_documents/extract/mod.rs | 26 +++---- .../index_documents/helpers/grenad_helpers.rs | 69 ++++++++++++++----- .../src/update/index_documents/helpers/mod.rs | 4 +- 3 files changed, 69 insertions(+), 30 deletions(-) diff --git a/milli/src/update/index_documents/extract/mod.rs b/milli/src/update/index_documents/extract/mod.rs index 4c81b9334..100431237 100644 --- a/milli/src/update/index_documents/extract/mod.rs +++ b/milli/src/update/index_documents/extract/mod.rs @@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids use self::extract_word_position_docids::extract_word_position_docids; use super::helpers::{ as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps, - merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, + merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader, }; use super::{helpers, TypedChunk}; use crate::{FieldId, Result}; @@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents( (docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks), ) = result?; - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents( "word-pair-proximity-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-wordcount-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents( "word-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_word_positions_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents( "word-position-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_strings_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents( "field-id-facet-string-docids", ); - spawn_extraction_task( + spawn_extraction_task::<_, _, Vec>>( docid_fid_facet_numbers_chunks.clone(), indexer.clone(), lmdb_writer_sx.clone(), @@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents( /// Generated grenad chunks are merged using the merge_fn. /// The result of merged chunks is serialized as TypedChunk using the serialize_fn /// and sent into lmdb_writer_sx. -fn spawn_extraction_task( +fn spawn_extraction_task( chunks: Vec>, indexer: GrenadParameters, lmdb_writer_sx: Sender>, @@ -142,19 +142,21 @@ fn spawn_extraction_task( serialize_fn: FS, name: &'static str, ) where - FE: Fn(grenad::Reader, GrenadParameters) -> Result> + FE: Fn(grenad::Reader, GrenadParameters) -> Result + Sync + Send + 'static, - FS: Fn(grenad::Reader) -> TypedChunk + Sync + Send + 'static, + FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static, + M: MergeableReader + FromParallelIterator + Send + 'static, + M::Output: Send, { rayon::spawn(move || { - let chunks: Result> = + let chunks: Result = chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect(); rayon::spawn(move || match chunks { Ok(chunks) => { debug!("merge {} database", name); - let reader = merge_readers(chunks, merge_fn, indexer); + let reader = chunks.merge(merge_fn, &indexer); let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r))); } Err(e) => { diff --git a/milli/src/update/index_documents/helpers/grenad_helpers.rs b/milli/src/update/index_documents/helpers/grenad_helpers.rs index e0ac3a175..fc28860b2 100644 --- a/milli/src/update/index_documents/helpers/grenad_helpers.rs +++ b/milli/src/update/index_documents/helpers/grenad_helpers.rs @@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad( Ok(reader) } -pub fn merge_readers( - readers: Vec>, - merge_fn: MergeFn, - indexer: GrenadParameters, -) -> Result> { - let mut merger_builder = grenad::MergerBuilder::new(merge_fn); - for reader in readers { - merger_builder.push(reader.into_cursor()?); +pub trait MergeableReader +where + Self: Sized, +{ + type Output; + + fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result; +} + +impl MergeableReader for Vec> { + type Output = grenad::Reader; + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut merger = MergerBuilder::new(merge_fn); + self.into_iter().try_for_each(|r| merger.push(r))?; + merger.finish(params) + } +} + +impl MergeableReader for Vec<(grenad::Reader, grenad::Reader)> { + type Output = (grenad::Reader, grenad::Reader); + + fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result { + let mut m1 = MergerBuilder::new(merge_fn); + let mut m2 = MergerBuilder::new(merge_fn); + for (r1, r2) in self.into_iter() { + m1.push(r1)?; + m2.push(r2)?; + } + Ok((m1.finish(params)?, m2.finish(params)?)) + } +} + +struct MergerBuilder(grenad::MergerBuilder); + +impl MergerBuilder { + fn new(merge_fn: MergeFn) -> Self { + Self(grenad::MergerBuilder::new(merge_fn)) } - let merger = merger_builder.build(); - let mut writer = create_writer( - indexer.chunk_compression_type, - indexer.chunk_compression_level, - tempfile::tempfile()?, - ); - merger.write_into_stream_writer(&mut writer)?; + fn push(&mut self, reader: grenad::Reader) -> Result<()> { + self.0.push(reader.into_cursor()?); + Ok(()) + } - Ok(writer_into_reader(writer)?) + fn finish(self, params: &GrenadParameters) -> Result> { + let merger = self.0.build(); + let mut writer = create_writer( + params.chunk_compression_type, + params.chunk_compression_level, + tempfile::tempfile()?, + ); + merger.write_into_stream_writer(&mut writer)?; + + Ok(writer_into_reader(writer)?) + } } #[derive(Debug, Clone, Copy)] diff --git a/milli/src/update/index_documents/helpers/mod.rs b/milli/src/update/index_documents/helpers/mod.rs index 22c1cfd6c..f4940af1d 100644 --- a/milli/src/update/index_documents/helpers/mod.rs +++ b/milli/src/update/index_documents/helpers/mod.rs @@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto}; pub use clonable_mmap::{ClonableMmap, CursorClonableMmap}; use fst::{IntoStreamer, Streamer}; pub use grenad_helpers::{ - as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers, + as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader, - GrenadParameters, + GrenadParameters, MergeableReader, }; pub use merge_functions::{ concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,