This commit is contained in:
Clément Renault 2024-10-29 14:38:52 +01:00
parent 82f6e3f3b9
commit 31680f3014
No known key found for this signature in database
GPG Key ID: F250A4C4E3AE5F5F
8 changed files with 151 additions and 191 deletions

View File

@ -1,8 +1,6 @@
use std::fs::File;
use std::marker::PhantomData; use std::marker::PhantomData;
use crossbeam_channel::{IntoIter, Receiver, SendError, Sender}; use crossbeam_channel::{IntoIter, Receiver, SendError, Sender};
use grenad::Merger;
use heed::types::Bytes; use heed::types::Bytes;
use memmap2::Mmap; use memmap2::Mmap;
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
@ -10,8 +8,8 @@ use roaring::RoaringBitmap;
use super::extract::FacetKind; use super::extract::FacetKind;
use super::StdResult; use super::StdResult;
use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY}; use crate::index::main_key::{DOCUMENTS_IDS_KEY, WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
use crate::update::new::extract::CboCachedSorter;
use crate::update::new::KvReaderFieldId; use crate::update::new::KvReaderFieldId;
use crate::update::MergeDeladdCboRoaringBitmaps;
use crate::{DocumentId, Index}; use crate::{DocumentId, Index};
/// The capacity of the channel is currently in number of messages. /// The capacity of the channel is currently in number of messages.
@ -29,7 +27,9 @@ pub fn merger_writer_channel(cap: usize) -> (MergerSender, WriterReceiver) {
} }
/// The capacity of the channel is currently in number of messages. /// The capacity of the channel is currently in number of messages.
pub fn extractors_merger_channels(cap: usize) -> (ExtractorSender, MergerReceiver) { pub fn extractors_merger_channels<'extractor>(
cap: usize,
) -> (ExtractorSender<'extractor>, MergerReceiver<'extractor>) {
let (sender, receiver) = crossbeam_channel::bounded(cap); let (sender, receiver) = crossbeam_channel::bounded(cap);
(ExtractorSender(sender), MergerReceiver(receiver)) (ExtractorSender(sender), MergerReceiver(receiver))
} }
@ -313,7 +313,9 @@ pub trait DatabaseType {
} }
pub trait MergerOperationType { pub trait MergerOperationType {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation; fn new_merger_operation<'extractor>(
caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor>;
} }
impl DatabaseType for ExactWordDocids { impl DatabaseType for ExactWordDocids {
@ -321,8 +323,10 @@ impl DatabaseType for ExactWordDocids {
} }
impl MergerOperationType for ExactWordDocids { impl MergerOperationType for ExactWordDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::ExactWordDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::ExactWordDocidsMerger(caches)
} }
} }
@ -331,8 +335,10 @@ impl DatabaseType for FidWordCountDocids {
} }
impl MergerOperationType for FidWordCountDocids { impl MergerOperationType for FidWordCountDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::FidWordCountDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::FidWordCountDocidsMerger(caches)
} }
} }
@ -341,8 +347,10 @@ impl DatabaseType for WordDocids {
} }
impl MergerOperationType for WordDocids { impl MergerOperationType for WordDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::WordDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::WordDocidsMerger(caches)
} }
} }
@ -351,8 +359,10 @@ impl DatabaseType for WordFidDocids {
} }
impl MergerOperationType for WordFidDocids { impl MergerOperationType for WordFidDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::WordFidDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::WordFidDocidsMerger(caches)
} }
} }
@ -361,8 +371,10 @@ impl DatabaseType for WordPairProximityDocids {
} }
impl MergerOperationType for WordPairProximityDocids { impl MergerOperationType for WordPairProximityDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::WordPairProximityDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::WordPairProximityDocidsMerger(caches)
} }
} }
@ -371,14 +383,18 @@ impl DatabaseType for WordPositionDocids {
} }
impl MergerOperationType for WordPositionDocids { impl MergerOperationType for WordPositionDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::WordPositionDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::WordPositionDocidsMerger(caches)
} }
} }
impl MergerOperationType for FacetDocids { impl MergerOperationType for FacetDocids {
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation { fn new_merger_operation<'extractor>(
MergerOperation::FacetDocidsMerger(merger) caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> MergerOperation<'extractor> {
MergerOperation::FacetDocidsMerger(caches)
} }
} }
@ -489,23 +505,23 @@ impl DocumentsSender<'_> {
} }
} }
pub enum MergerOperation { pub enum MergerOperation<'extractor> {
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), ExactWordDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
FidWordCountDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), FidWordCountDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordFidDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
WordPairProximityDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordPairProximityDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
WordPositionDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), WordPositionDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
FacetDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>), FacetDocidsMerger(Vec<Vec<CboCachedSorter<'extractor>>>),
DeleteDocument { docid: DocumentId, external_id: String }, DeleteDocument { docid: DocumentId, external_id: String },
InsertDocument { docid: DocumentId, external_id: String, document: Box<KvReaderFieldId> }, InsertDocument { docid: DocumentId, external_id: String, document: Box<KvReaderFieldId> },
FinishedDocument, FinishedDocument,
} }
pub struct MergerReceiver(Receiver<MergerOperation>); pub struct MergerReceiver<'extractor>(Receiver<MergerOperation<'extractor>>);
impl IntoIterator for MergerReceiver { impl<'extractor> IntoIterator for MergerReceiver<'extractor> {
type Item = MergerOperation; type Item = MergerOperation<'extractor>;
type IntoIter = IntoIter<Self::Item>; type IntoIter = IntoIter<Self::Item>;
fn into_iter(self) -> Self::IntoIter { fn into_iter(self) -> Self::IntoIter {
@ -513,27 +529,27 @@ impl IntoIterator for MergerReceiver {
} }
} }
pub struct ExtractorSender(Sender<MergerOperation>); pub struct ExtractorSender<'extractor>(Sender<MergerOperation<'extractor>>);
impl ExtractorSender { impl<'extractor> ExtractorSender<'extractor> {
pub fn document_sender(&self) -> DocumentSender<'_> { pub fn document_sender(&self) -> DocumentSender<'_, 'extractor> {
DocumentSender(Some(&self.0)) DocumentSender(Some(&self.0))
} }
pub fn send_searchable<D: MergerOperationType>( pub fn send_searchable<D: MergerOperationType>(
&self, &self,
merger: Merger<File, MergeDeladdCboRoaringBitmaps>, caches: Vec<Vec<CboCachedSorter<'extractor>>>,
) -> StdResult<(), SendError<()>> { ) -> StdResult<(), SendError<()>> {
match self.0.send(D::new_merger_operation(merger)) { match self.0.send(D::new_merger_operation(caches)) {
Ok(()) => Ok(()), Ok(()) => Ok(()),
Err(SendError(_)) => Err(SendError(())), Err(SendError(_)) => Err(SendError(())),
} }
} }
} }
pub struct DocumentSender<'a>(Option<&'a Sender<MergerOperation>>); pub struct DocumentSender<'a, 'extractor>(Option<&'a Sender<MergerOperation<'extractor>>>);
impl DocumentSender<'_> { impl DocumentSender<'_, '_> {
pub fn insert( pub fn insert(
&self, &self,
docid: DocumentId, docid: DocumentId,
@ -564,7 +580,7 @@ impl DocumentSender<'_> {
} }
} }
impl Drop for DocumentSender<'_> { impl Drop for DocumentSender<'_, '_> {
fn drop(&mut self) { fn drop(&mut self) {
if let Some(sender) = self.0.take() { if let Some(sender) = self.0.take() {
let _ = sender.send(MergerOperation::FinishedDocument); let _ = sender.send(MergerOperation::FinishedDocument);

View File

@ -392,7 +392,7 @@ pub fn transpose_and_freeze_caches<'a, 'extractor>(
/// # Panics /// # Panics
/// ///
/// - If the bucket IDs in these frozen caches are not exactly the same. /// - If the bucket IDs in these frozen caches are not exactly the same.
pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut iter: F) -> Result<()> pub fn merge_caches<F>(frozen: Vec<FrozenCache>, mut f: F) -> Result<()>
where where
F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>, F: for<'a> FnMut(&'a [u8], DelAddRoaringBitmap) -> Result<()>,
{ {
@ -455,7 +455,7 @@ where
} }
// We send the merged entry outside. // We send the merged entry outside.
(iter)(first_key, output)?; (f)(first_key, output)?;
// Don't forget to put the first entry back into the heap. // Don't forget to put the first entry back into the heap.
if first_entry.cursor.move_on_next()?.is_some() { if first_entry.cursor.move_on_next()?.is_some() {
@ -478,7 +478,7 @@ where
} }
// We send the merged entry outside. // We send the merged entry outside.
(iter)(key, output)?; (f)(key, output)?;
} }
} }
} }

View File

@ -1,12 +1,9 @@
use std::cell::RefCell; use std::cell::RefCell;
use std::collections::HashSet; use std::collections::HashSet;
use std::fs::File;
use std::ops::DerefMut as _; use std::ops::DerefMut as _;
use bumpalo::Bump; use bumpalo::Bump;
use grenad::Merger;
use heed::RoTxn; use heed::RoTxn;
use raw_collections::alloc::RefBump;
use serde_json::Value; use serde_json::Value;
use super::super::cache::CboCachedSorter; use super::super::cache::CboCachedSorter;
@ -19,16 +16,16 @@ use crate::update::new::indexer::document_changes::{
IndexingContext, RefCellExt, ThreadLocal, IndexingContext, RefCellExt, ThreadLocal,
}; };
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::update::GrenadParameters;
use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH}; use crate::{DocumentId, FieldId, Index, Result, MAX_FACET_VALUE_LENGTH};
pub struct FacetedExtractorData<'extractor> { pub struct FacetedExtractorData<'a> {
attributes_to_extract: &'extractor [&'extractor str], attributes_to_extract: &'a [&'a str],
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
max_memory: Option<usize>, max_memory: Option<usize>,
} }
impl<'extractor> Extractor<'extractor> for FacetedExtractorData<'extractor> { impl<'a, 'extractor> Extractor<'extractor> for FacetedExtractorData<'a> {
type Data = RefCell<CboCachedSorter<'extractor>>; type Data = RefCell<CboCachedSorter<'extractor>>;
fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> { fn init_data(&self, extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
@ -217,12 +214,12 @@ fn truncate_str(s: &str) -> &str {
impl DocidsExtractor for FacetedDocidsExtractor { impl DocidsExtractor for FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>, indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> { ) -> Result<Vec<CboCachedSorter<'extractor>>> {
let max_memory = grenad_parameters.max_memory_by_thread(); let max_memory = grenad_parameters.max_memory_by_thread();
let index = indexing_context.index; let index = indexing_context.index;
@ -251,26 +248,7 @@ impl DocidsExtractor for FacetedDocidsExtractor {
&datastore, &datastore,
)?; )?;
} }
{
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
let span =
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
let _entered = span.enter();
let readers: Vec<_> = datastore Ok(datastore.into_iter().map(RefCell::into_inner).collect())
.into_iter()
// .par_bridge() // T is !Send
.map(|cached_sorter| {
let cached_sorter = cached_sorter.into_inner();
let sorter = cached_sorter.into_sorter()?;
sorter.into_reader_cursors()
})
.collect();
for reader in readers {
builder.extend(reader?);
}
Ok(builder.build())
}
} }
} }

View File

@ -2,25 +2,22 @@ mod cache;
mod faceted; mod faceted;
mod searchable; mod searchable;
use std::cell::RefCell;
use std::fs::File;
use bumpalo::Bump; use bumpalo::Bump;
pub use cache::{merge_caches, transpose_and_freeze_caches, CboCachedSorter, DelAddRoaringBitmap};
pub use faceted::*; pub use faceted::*;
use grenad::Merger;
pub use searchable::*; pub use searchable::*;
use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal}; use super::indexer::document_changes::{DocumentChanges, FullySend, IndexingContext, ThreadLocal};
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::update::GrenadParameters;
use crate::Result; use crate::Result;
pub trait DocidsExtractor { pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>, indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>>; ) -> Result<Vec<CboCachedSorter<'extractor>>>;
} }
/// TODO move in permissive json pointer /// TODO move in permissive json pointer

View File

@ -7,7 +7,6 @@ use std::ops::DerefMut as _;
use bumpalo::Bump; use bumpalo::Bump;
use grenad::{Merger, MergerBuilder}; use grenad::{Merger, MergerBuilder};
use heed::RoTxn; use heed::RoTxn;
use raw_collections::alloc::RefBump;
use super::tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::CboCachedSorter; use crate::update::new::extract::cache::CboCachedSorter;
@ -157,15 +156,15 @@ struct WordDocidsMergerBuilders {
fid_word_count_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>, fid_word_count_docids: MergerBuilder<File, MergeDeladdCboRoaringBitmaps>,
} }
pub struct WordDocidsMergers { pub struct WordDocidsMergers<'extractor> {
pub word_fid_docids: Merger<File, MergeDeladdCboRoaringBitmaps>, pub word_fid_docids: Vec<Vec<CboCachedSorter<'extractor>>>,
pub word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>, pub word_docids: Vec<Vec<CboCachedSorter<'extractor>>>,
pub exact_word_docids: Merger<File, MergeDeladdCboRoaringBitmaps>, pub exact_word_docids: Vec<Vec<CboCachedSorter<'extractor>>>,
pub word_position_docids: Merger<File, MergeDeladdCboRoaringBitmaps>, pub word_position_docids: Vec<Vec<CboCachedSorter<'extractor>>>,
pub fid_word_count_docids: Merger<File, MergeDeladdCboRoaringBitmaps>, pub fid_word_count_docids: Vec<Vec<CboCachedSorter<'extractor>>>,
} }
impl WordDocidsMergerBuilders { impl<'extractor> WordDocidsMergerBuilders<'extractor> {
fn new() -> Self { fn new() -> Self {
Self { Self {
word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps), word_fid_docids: MergerBuilder::new(MergeDeladdCboRoaringBitmaps),
@ -202,7 +201,7 @@ impl WordDocidsMergerBuilders {
Ok(()) Ok(())
} }
fn build(self) -> WordDocidsMergers { fn build(self) -> WordDocidsMergers<'extractor> {
WordDocidsMergers { WordDocidsMergers {
word_fid_docids: self.word_fid_docids.build(), word_fid_docids: self.word_fid_docids.build(),
word_docids: self.word_docids.build(), word_docids: self.word_docids.build(),

View File

@ -9,9 +9,7 @@ use std::marker::PhantomData;
use bumpalo::Bump; use bumpalo::Bump;
pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers}; pub use extract_word_docids::{WordDocidsExtractors, WordDocidsMergers};
pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor; pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
use grenad::Merger;
use heed::RoTxn; use heed::RoTxn;
use raw_collections::alloc::RefBump;
use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::CboCachedSorter; use super::cache::CboCachedSorter;
@ -21,18 +19,18 @@ use crate::update::new::indexer::document_changes::{
IndexingContext, ThreadLocal, IndexingContext, ThreadLocal,
}; };
use crate::update::new::DocumentChange; use crate::update::new::DocumentChange;
use crate::update::{GrenadParameters, MergeDeladdCboRoaringBitmaps}; use crate::update::GrenadParameters;
use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE}; use crate::{Index, Result, MAX_POSITION_PER_ATTRIBUTE};
pub struct SearchableExtractorData<'extractor, EX: SearchableExtractor> { pub struct SearchableExtractorData<'a, EX: SearchableExtractor> {
tokenizer: &'extractor DocumentTokenizer<'extractor>, tokenizer: &'a DocumentTokenizer<'a>,
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
max_memory: Option<usize>, max_memory: Option<usize>,
_ex: PhantomData<EX>, _ex: PhantomData<EX>,
} }
impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor> impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
for SearchableExtractorData<'extractor, EX> for SearchableExtractorData<'a, EX>
{ {
type Data = RefCell<CboCachedSorter<'extractor>>; type Data = RefCell<CboCachedSorter<'extractor>>;
@ -50,12 +48,12 @@ impl<'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
} }
pub trait SearchableExtractor: Sized + Sync { pub trait SearchableExtractor: Sized + Sync {
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>, indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> { ) -> Result<Vec<CboCachedSorter<'extractor>>> {
let max_memory = grenad_parameters.max_memory_by_thread(); let max_memory = grenad_parameters.max_memory_by_thread();
let rtxn = indexing_context.index.read_txn()?; let rtxn = indexing_context.index.read_txn()?;
@ -107,28 +105,8 @@ pub trait SearchableExtractor: Sized + Sync {
&datastore, &datastore,
)?; )?;
} }
{
let mut builder = grenad::MergerBuilder::new(MergeDeladdCboRoaringBitmaps);
let span =
tracing::trace_span!(target: "indexing::documents::extract", "merger_building");
let _entered = span.enter();
let readers: Vec<_> = datastore Ok(datastore.into_iter().map(RefCell::into_inner).collect())
.into_iter()
// .par_bridge() // T is !Send
.map(|cache_sorter| {
let cached_sorter = cache_sorter.into_inner();
let sorter = cached_sorter.into_sorter()?;
sorter.into_reader_cursors()
})
.collect();
for reader in readers {
builder.extend(reader?);
}
Ok(builder.build())
}
} }
fn extract_document_change( fn extract_document_change(
@ -144,12 +122,12 @@ pub trait SearchableExtractor: Sized + Sync {
} }
impl<T: SearchableExtractor> DocidsExtractor for T { impl<T: SearchableExtractor> DocidsExtractor for T {
fn run_extraction<'pl, 'fid, 'indexer, 'index, DC: DocumentChanges<'pl>>( fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>>(
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>, indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
) -> Result<Merger<File, MergeDeladdCboRoaringBitmaps>> { ) -> Result<Vec<CboCachedSorter<'extractor>>> {
Self::run_extraction( Self::run_extraction(
grenad_parameters, grenad_parameters,
document_changes, document_changes,

View File

@ -20,7 +20,7 @@ use super::channel::*;
use super::document::write_to_obkv; use super::document::write_to_obkv;
use super::document_change::DocumentChange; use super::document_change::DocumentChange;
use super::extract::*; use super::extract::*;
use super::merger::{merge_grenad_entries, FacetFieldIdsDelta}; use super::merger::{merge_caches_entries, FacetFieldIdsDelta};
use super::word_fst_builder::PrefixDelta; use super::word_fst_builder::PrefixDelta;
use super::words_prefix_docids::{ use super::words_prefix_docids::{
compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids, compute_word_prefix_docids, compute_word_prefix_fid_docids, compute_word_prefix_position_docids,
@ -33,7 +33,7 @@ use crate::update::new::channel::ExtractorSender;
use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids; use crate::update::new::words_prefix_docids::compute_exact_word_prefix_docids;
use crate::update::settings::InnerIndexSettings; use crate::update::settings::InnerIndexSettings;
use crate::update::{FacetsUpdateBulk, GrenadParameters}; use crate::update::{FacetsUpdateBulk, GrenadParameters};
use crate::{Error, FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError}; use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, Result, UserError};
pub(crate) mod de; pub(crate) mod de;
pub mod document_changes; pub mod document_changes;
@ -42,11 +42,11 @@ mod document_operation;
mod partial_dump; mod partial_dump;
mod update_by_function; mod update_by_function;
struct DocumentExtractor<'a> { struct DocumentExtractor<'a, 'extractor> {
document_sender: &'a DocumentSender<'a>, document_sender: &'a DocumentSender<'a, 'extractor>,
} }
impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a> { impl<'a, 'extractor> Extractor<'extractor> for DocumentExtractor<'a, 'extractor> {
type Data = FullySend<()>; type Data = FullySend<()>;
fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> { fn init_data(&self, _extractor_alloc: &'extractor Bump) -> Result<Self::Data> {
@ -179,6 +179,7 @@ where
word_position_docids, word_position_docids,
fid_word_count_docids, fid_word_count_docids,
} = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?; } = WordDocidsExtractors::run_extraction(grenad_parameters, document_changes, indexing_context, &mut extractor_allocs)?;
extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap(); extractor_sender.send_searchable::<WordDocids>(word_docids).unwrap();
extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap(); extractor_sender.send_searchable::<WordFidDocids>(word_fid_docids).unwrap();
extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap(); extractor_sender.send_searchable::<ExactWordDocids>(exact_word_docids).unwrap();
@ -201,7 +202,7 @@ where
grenad_parameters, grenad_parameters,
document_changes, document_changes,
indexing_context, indexing_context,
&mut extractor_allocs, &mut extractor_allocs,
&extractor_sender, &extractor_sender,
)?; )?;
} }
@ -239,7 +240,7 @@ where
tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge"); tracing::trace_span!(target: "indexing::documents", parent: &indexer_span, "merge");
let _entered = span.enter(); let _entered = span.enter();
let rtxn = index.read_txn().unwrap(); let rtxn = index.read_txn().unwrap();
merge_grenad_entries( merge_caches_entries(
merger_receiver, merger_receiver,
merger_sender, merger_sender,
&rtxn, &rtxn,
@ -352,6 +353,7 @@ fn extract_and_send_docids<
'fid, 'fid,
'indexer, 'indexer,
'index, 'index,
'extractor,
DC: DocumentChanges<'pl>, DC: DocumentChanges<'pl>,
E: DocidsExtractor, E: DocidsExtractor,
D: MergerOperationType, D: MergerOperationType,
@ -359,12 +361,12 @@ fn extract_and_send_docids<
grenad_parameters: GrenadParameters, grenad_parameters: GrenadParameters,
document_changes: &DC, document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index>, indexing_context: IndexingContext<'fid, 'indexer, 'index>,
extractor_allocs: &mut ThreadLocal<FullySend<Bump>>, extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
sender: &ExtractorSender, sender: &ExtractorSender<'extractor>,
) -> Result<()> { ) -> Result<()> {
let merger = let caches =
E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?; E::run_extraction(grenad_parameters, document_changes, indexing_context, extractor_allocs)?;
sender.send_searchable::<D>(merger).unwrap(); sender.send_searchable::<D>(caches).unwrap();
Ok(()) Ok(())
} }

View File

@ -6,13 +6,16 @@ use grenad::Merger;
use hashbrown::HashSet; use hashbrown::HashSet;
use heed::types::Bytes; use heed::types::Bytes;
use heed::{Database, RoTxn}; use heed::{Database, RoTxn};
use rayon::iter::{IntoParallelIterator, ParallelIterator};
use roaring::RoaringBitmap; use roaring::RoaringBitmap;
use super::channel::*; use super::channel::*;
use super::extract::FacetKind; use super::extract::{
merge_caches, transpose_and_freeze_caches, CboCachedSorter, DelAddRoaringBitmap, FacetKind,
};
use super::word_fst_builder::{PrefixData, PrefixDelta}; use super::word_fst_builder::{PrefixData, PrefixDelta};
use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId}; use super::{Deletion, DocumentChange, KvReaderDelAdd, KvReaderFieldId};
use crate::update::del_add::DelAdd; use crate::update::del_add::{DelAdd, DelAddOperation};
use crate::update::new::channel::MergerOperation; use crate::update::new::channel::MergerOperation;
use crate::update::new::word_fst_builder::WordFstBuilder; use crate::update::new::word_fst_builder::WordFstBuilder;
use crate::update::MergeDeladdCboRoaringBitmaps; use crate::update::MergeDeladdCboRoaringBitmaps;
@ -20,7 +23,7 @@ use crate::{CboRoaringBitmapCodec, Error, FieldId, GeoPoint, GlobalFieldsIdsMap,
/// TODO We must return some infos/stats /// TODO We must return some infos/stats
#[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::documents", name = "merge")]
pub fn merge_grenad_entries( pub fn merge_caches_entries(
receiver: MergerReceiver, receiver: MergerReceiver,
sender: MergerSender, sender: MergerSender,
rtxn: &RoTxn, rtxn: &RoTxn,
@ -34,12 +37,12 @@ pub fn merge_grenad_entries(
for merger_operation in receiver { for merger_operation in receiver {
match merger_operation { match merger_operation {
MergerOperation::ExactWordDocidsMerger(merger) => { MergerOperation::ExactWordDocidsMerger(caches) => {
let span = let span =
tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids"); tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
let _entered = span.enter(); let _entered = span.enter();
merge_and_send_docids( merge_and_send_docids(
merger, caches,
/// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>. /// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
index.exact_word_docids.remap_types(), index.exact_word_docids.remap_types(),
rtxn, rtxn,
@ -192,8 +195,6 @@ pub fn merge_grenad_entries(
sender.send_documents_ids(documents_ids).unwrap(); sender.send_documents_ids(documents_ids).unwrap();
} }
// ...
Ok(merger_result) Ok(merger_result)
} }
@ -254,69 +255,61 @@ impl GeoExtractor {
} }
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
fn merge_and_send_docids( fn merge_and_send_docids<'extractor>(
merger: Merger<File, MergeDeladdCboRoaringBitmaps>, mut caches: Vec<Vec<CboCachedSorter<'extractor>>>,
database: Database<Bytes, Bytes>, database: Database<Bytes, Bytes>,
rtxn: &RoTxn<'_>, rtxn: &RoTxn<'_>,
buffer: &mut Vec<u8>, buffer: &mut Vec<u8>,
docids_sender: impl DocidsSender, docids_sender: impl DocidsSender,
mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()>, mut register_key: impl FnMut(DelAdd, &[u8]) -> Result<()>,
) -> Result<()> { ) -> Result<()> {
let mut merger_iter = merger.into_stream_merger_iter().unwrap(); transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
while let Some((key, deladd)) = merger_iter.next().unwrap() { merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
let current = database.get(rtxn, key)?; let current = database.get(rtxn, key)?;
let deladd: &KvReaderDelAdd = deladd.into(); match merge_cbo_bitmaps(current, del, add)? {
let del = deladd.get(DelAdd::Deletion); Operation::Write(bitmap) => {
let add = deladd.get(DelAdd::Addition); let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
docids_sender.write(key, value).unwrap();
match merge_cbo_bitmaps(current, del, add)? { register_key(DelAdd::Addition, key)
Operation::Write(bitmap) => { }
let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); Operation::Delete => {
docids_sender.write(key, value).unwrap(); docids_sender.delete(key).unwrap();
register_key(DelAdd::Addition, key)?; register_key(DelAdd::Deletion, key)
}
Operation::Ignore => Ok(()),
} }
Operation::Delete => { })
docids_sender.delete(key).unwrap(); })
register_key(DelAdd::Deletion, key)?;
}
Operation::Ignore => (),
}
}
Ok(())
} }
#[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")] #[tracing::instrument(level = "trace", skip_all, target = "indexing::merge")]
fn merge_and_send_facet_docids( fn merge_and_send_facet_docids<'extractor>(
merger: Merger<File, MergeDeladdCboRoaringBitmaps>, mut caches: Vec<Vec<CboCachedSorter<'extractor>>>,
database: FacetDatabases, database: FacetDatabases,
rtxn: &RoTxn<'_>, rtxn: &RoTxn<'_>,
buffer: &mut Vec<u8>, buffer: &mut Vec<u8>,
docids_sender: impl DocidsSender, docids_sender: impl DocidsSender,
facet_field_ids_delta: &mut FacetFieldIdsDelta, facet_field_ids_delta: &mut FacetFieldIdsDelta,
) -> Result<()> { ) -> Result<()> {
let mut merger_iter = merger.into_stream_merger_iter().unwrap(); transpose_and_freeze_caches(&mut caches)?.into_par_iter().try_for_each(|frozen| {
while let Some((key, deladd)) = merger_iter.next().unwrap() { merge_caches(frozen, |key, DelAddRoaringBitmap { del, add }| {
let current = database.get_cbo_roaring_bytes_value(rtxn, key)?; let current = database.get_cbo_roaring_bytes_value(rtxn, key)?;
let deladd: &KvReaderDelAdd = deladd.into(); match merge_cbo_bitmaps(current, del, add)? {
let del = deladd.get(DelAdd::Deletion); Operation::Write(bitmap) => {
let add = deladd.get(DelAdd::Addition); facet_field_ids_delta.register_from_key(key);
let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer);
match merge_cbo_bitmaps(current, del, add)? { docids_sender.write(key, value).unwrap();
Operation::Write(bitmap) => { Ok(())
facet_field_ids_delta.register_from_key(key); }
let value = cbo_bitmap_serialize_into_vec(&bitmap, buffer); Operation::Delete => {
docids_sender.write(key, value).unwrap(); facet_field_ids_delta.register_from_key(key);
docids_sender.delete(key).unwrap();
Ok(())
}
Operation::Ignore => Ok(()),
} }
Operation::Delete => { })
facet_field_ids_delta.register_from_key(key); })
docids_sender.delete(key).unwrap();
}
Operation::Ignore => (),
}
}
Ok(())
} }
struct FacetDatabases<'a> { struct FacetDatabases<'a> {
@ -409,13 +402,10 @@ enum Operation {
/// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap. /// A function that merges the DelAdd CboRoaringBitmaps with the current bitmap.
fn merge_cbo_bitmaps( fn merge_cbo_bitmaps(
current: Option<&[u8]>, current: Option<&[u8]>,
del: Option<&[u8]>, del: Option<RoaringBitmap>,
add: Option<&[u8]>, add: Option<RoaringBitmap>,
) -> Result<Operation> { ) -> Result<Operation> {
let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?; let current = current.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
let del = del.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
let add = add.map(CboRoaringBitmapCodec::deserialize_from).transpose()?;
match (current, del, add) { match (current, del, add) {
(None, None, None) => Ok(Operation::Ignore), // but it's strange (None, None, None) => Ok(Operation::Ignore), // but it's strange
(None, None, Some(add)) => Ok(Operation::Write(add)), (None, None, Some(add)) => Ok(Operation::Write(add)),