mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 00:34:26 +01:00
refactor spawn_extraction_task
This commit is contained in:
parent
f82d4b36eb
commit
5f9f82757d
@ -26,7 +26,7 @@ use self::extract_word_pair_proximity_docids::extract_word_pair_proximity_docids
|
|||||||
use self::extract_word_position_docids::extract_word_position_docids;
|
use self::extract_word_position_docids::extract_word_position_docids;
|
||||||
use super::helpers::{
|
use super::helpers::{
|
||||||
as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
|
as_cloneable_grenad, keep_first_prefix_value_merge_roaring_bitmaps, merge_cbo_roaring_bitmaps,
|
||||||
merge_readers, merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn,
|
merge_roaring_bitmaps, CursorClonableMmap, GrenadParameters, MergeFn, MergeableReader,
|
||||||
};
|
};
|
||||||
use super::{helpers, TypedChunk};
|
use super::{helpers, TypedChunk};
|
||||||
use crate::{FieldId, Result};
|
use crate::{FieldId, Result};
|
||||||
@ -66,7 +66,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
|
(docid_fid_facet_numbers_chunks, docid_fid_facet_strings_chunks),
|
||||||
) = result?;
|
) = result?;
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -76,7 +76,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-pair-proximity-docids",
|
"word-pair-proximity-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -86,7 +86,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-wordcount-docids",
|
"field-id-wordcount-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -96,7 +96,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-docids",
|
"word-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_word_positions_chunks.clone(),
|
docid_word_positions_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -106,7 +106,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"word-position-docids",
|
"word-position-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_strings_chunks.clone(),
|
docid_fid_facet_strings_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -116,7 +116,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
"field-id-facet-string-docids",
|
"field-id-facet-string-docids",
|
||||||
);
|
);
|
||||||
|
|
||||||
spawn_extraction_task(
|
spawn_extraction_task::<_, _, Vec<grenad::Reader<File>>>(
|
||||||
docid_fid_facet_numbers_chunks.clone(),
|
docid_fid_facet_numbers_chunks.clone(),
|
||||||
indexer.clone(),
|
indexer.clone(),
|
||||||
lmdb_writer_sx.clone(),
|
lmdb_writer_sx.clone(),
|
||||||
@ -133,7 +133,7 @@ pub(crate) fn data_from_obkv_documents(
|
|||||||
/// Generated grenad chunks are merged using the merge_fn.
|
/// Generated grenad chunks are merged using the merge_fn.
|
||||||
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
/// The result of merged chunks is serialized as TypedChunk using the serialize_fn
|
||||||
/// and sent into lmdb_writer_sx.
|
/// and sent into lmdb_writer_sx.
|
||||||
fn spawn_extraction_task<FE, FS>(
|
fn spawn_extraction_task<FE, FS, M>(
|
||||||
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
|
chunks: Vec<grenad::Reader<CursorClonableMmap>>,
|
||||||
indexer: GrenadParameters,
|
indexer: GrenadParameters,
|
||||||
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
lmdb_writer_sx: Sender<Result<TypedChunk>>,
|
||||||
@ -142,19 +142,21 @@ fn spawn_extraction_task<FE, FS>(
|
|||||||
serialize_fn: FS,
|
serialize_fn: FS,
|
||||||
name: &'static str,
|
name: &'static str,
|
||||||
) where
|
) where
|
||||||
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<grenad::Reader<File>>
|
FE: Fn(grenad::Reader<CursorClonableMmap>, GrenadParameters) -> Result<M::Output>
|
||||||
+ Sync
|
+ Sync
|
||||||
+ Send
|
+ Send
|
||||||
+ 'static,
|
+ 'static,
|
||||||
FS: Fn(grenad::Reader<File>) -> TypedChunk + Sync + Send + 'static,
|
FS: Fn(M::Output) -> TypedChunk + Sync + Send + 'static,
|
||||||
|
M: MergeableReader + FromParallelIterator<M::Output> + Send + 'static,
|
||||||
|
M::Output: Send,
|
||||||
{
|
{
|
||||||
rayon::spawn(move || {
|
rayon::spawn(move || {
|
||||||
let chunks: Result<Vec<_>> =
|
let chunks: Result<M> =
|
||||||
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
|
chunks.into_par_iter().map(|chunk| extract_fn(chunk, indexer.clone())).collect();
|
||||||
rayon::spawn(move || match chunks {
|
rayon::spawn(move || match chunks {
|
||||||
Ok(chunks) => {
|
Ok(chunks) => {
|
||||||
debug!("merge {} database", name);
|
debug!("merge {} database", name);
|
||||||
let reader = merge_readers(chunks, merge_fn, indexer);
|
let reader = chunks.merge(merge_fn, &indexer);
|
||||||
let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
|
let _ = lmdb_writer_sx.send(reader.map(|r| serialize_fn(r)));
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
@ -78,25 +78,62 @@ pub unsafe fn as_cloneable_grenad(
|
|||||||
Ok(reader)
|
Ok(reader)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn merge_readers<R: io::Read + io::Seek>(
|
pub trait MergeableReader
|
||||||
readers: Vec<grenad::Reader<R>>,
|
where
|
||||||
merge_fn: MergeFn,
|
Self: Sized,
|
||||||
indexer: GrenadParameters,
|
{
|
||||||
) -> Result<grenad::Reader<File>> {
|
type Output;
|
||||||
let mut merger_builder = grenad::MergerBuilder::new(merge_fn);
|
|
||||||
for reader in readers {
|
fn merge(self, merge_fn: MergeFn, indexer: &GrenadParameters) -> Result<Self::Output>;
|
||||||
merger_builder.push(reader.into_cursor()?);
|
}
|
||||||
|
|
||||||
|
/// Merges a list of file-backed grenad readers into a single file-backed reader.
impl MergeableReader for Vec<grenad::Reader<File>> {
|
||||||
|
/// One merged reader is produced for the whole list.
type Output = grenad::Reader<File>;
|
||||||
|
|
||||||
|
/// Consumes the list, pushes every reader into one `MergerBuilder` created with
/// `merge_fn`, and returns what `MergerBuilder::finish(params)` produces.
///
/// # Errors
///
/// Returns the first error raised while pushing a reader, or the error
/// returned by `finish`.
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
|
let mut merger = MergerBuilder::new(merge_fn);
|
||||||
|
// `try_for_each` stops at the first failing `push`; `?` then propagates that error.
self.into_iter().try_for_each(|r| merger.push(r))?;
|
||||||
|
// `finish` consumes the builder and yields the merged reader.
merger.finish(params)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Merges a list of reader pairs position-wise: every first element goes into one
/// merged reader and every second element into another.
impl MergeableReader for Vec<(grenad::Reader<File>, grenad::Reader<File>)> {
|
||||||
|
/// A pair of merged readers, in the same order as the elements of the input tuples.
type Output = (grenad::Reader<File>, grenad::Reader<File>);
|
||||||
|
|
||||||
|
/// Consumes the list and builds two independent mergers that both use `merge_fn`
/// and `params`.
///
/// # Errors
///
/// Returns the first error raised by a `push` or by either `finish` call.
fn merge(self, merge_fn: MergeFn, params: &GrenadParameters) -> Result<Self::Output> {
|
||||||
|
// One builder per tuple position; the same `merge_fn` is handed to both.
let mut m1 = MergerBuilder::new(merge_fn);
|
||||||
|
let mut m2 = MergerBuilder::new(merge_fn);
|
||||||
|
for (r1, r2) in self.into_iter() {
|
||||||
|
m1.push(r1)?;
|
||||||
|
m2.push(r2)?;
|
||||||
|
}
|
||||||
|
// The first merger is finished before the second; if it fails, the second is never finished.
Ok((m1.finish(params)?, m2.finish(params)?))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct MergerBuilder<R>(grenad::MergerBuilder<R, MergeFn>);
|
||||||
|
|
||||||
|
impl<R: io::Read + io::Seek> MergerBuilder<R> {
|
||||||
|
fn new(merge_fn: MergeFn) -> Self {
|
||||||
|
Self(grenad::MergerBuilder::new(merge_fn))
|
||||||
}
|
}
|
||||||
|
|
||||||
let merger = merger_builder.build();
|
fn push(&mut self, reader: grenad::Reader<R>) -> Result<()> {
|
||||||
let mut writer = create_writer(
|
self.0.push(reader.into_cursor()?);
|
||||||
indexer.chunk_compression_type,
|
Ok(())
|
||||||
indexer.chunk_compression_level,
|
}
|
||||||
tempfile::tempfile()?,
|
|
||||||
);
|
|
||||||
merger.write_into_stream_writer(&mut writer)?;
|
|
||||||
|
|
||||||
Ok(writer_into_reader(writer)?)
|
fn finish(self, params: &GrenadParameters) -> Result<grenad::Reader<File>> {
|
||||||
|
let merger = self.0.build();
|
||||||
|
let mut writer = create_writer(
|
||||||
|
params.chunk_compression_type,
|
||||||
|
params.chunk_compression_level,
|
||||||
|
tempfile::tempfile()?,
|
||||||
|
);
|
||||||
|
merger.write_into_stream_writer(&mut writer)?;
|
||||||
|
|
||||||
|
Ok(writer_into_reader(writer)?)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
|
@ -8,9 +8,9 @@ use std::convert::{TryFrom, TryInto};
|
|||||||
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
pub use clonable_mmap::{ClonableMmap, CursorClonableMmap};
|
||||||
use fst::{IntoStreamer, Streamer};
|
use fst::{IntoStreamer, Streamer};
|
||||||
pub use grenad_helpers::{
|
pub use grenad_helpers::{
|
||||||
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks, merge_readers,
|
as_cloneable_grenad, create_sorter, create_writer, grenad_obkv_into_chunks,
|
||||||
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
|
sorter_into_lmdb_database, sorter_into_reader, write_into_lmdb_database, writer_into_reader,
|
||||||
GrenadParameters,
|
GrenadParameters, MergeableReader,
|
||||||
};
|
};
|
||||||
pub use merge_functions::{
|
pub use merge_functions::{
|
||||||
concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,
|
concat_u32s_array, keep_first, keep_first_prefix_value_merge_roaring_bitmaps, keep_latest_obkv,
|
||||||
|
Loading…
Reference in New Issue
Block a user