mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 03:47:02 +02:00
Merge #5147
5147: Batch progress r=dureuill a=irevoire # Pull Request ## Related issue Fixes https://github.com/meilisearch/meilisearch/issues/5068 ## What does this PR do? - ... ## PR checklist Please check if your PR fulfills the following requirements: - [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)? - [ ] Have you read the contributing guidelines? - [ ] Have you made sure that the title is accurate and descriptive of the changes? Thank you so much for contributing to Meilisearch! Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in:
commit
1fc90fbacb
38 changed files with 940 additions and 473 deletions
|
@ -766,6 +766,7 @@ mod tests {
|
|||
use crate::documents::mmap_from_objects;
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::index::IndexEmbeddingConfig;
|
||||
use crate::progress::Progress;
|
||||
use crate::search::TermsMatchingStrategy;
|
||||
use crate::update::new::indexer;
|
||||
use crate::update::Setting;
|
||||
|
@ -1964,7 +1965,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2148,7 +2149,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2163,7 +2164,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2210,7 +2211,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2225,7 +2226,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2263,7 +2264,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2278,7 +2279,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2315,7 +2316,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2330,7 +2331,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2369,7 +2370,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2384,7 +2385,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2428,7 +2429,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2443,7 +2444,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2480,7 +2481,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2495,7 +2496,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2532,7 +2533,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2547,7 +2548,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2726,7 +2727,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2741,7 +2742,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2785,7 +2786,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2800,7 +2801,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
@ -2841,7 +2842,7 @@ mod tests {
|
|||
None,
|
||||
&mut new_fields_ids_map,
|
||||
&|| false,
|
||||
&|_progress| (),
|
||||
Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
@ -2856,7 +2857,7 @@ mod tests {
|
|||
&document_changes,
|
||||
embedders,
|
||||
&|| false,
|
||||
&|_| (),
|
||||
&Progress::default(),
|
||||
)
|
||||
.unwrap();
|
||||
wtxn.commit().unwrap();
|
||||
|
|
|
@ -16,10 +16,10 @@ use crate::update::del_add::DelAdd;
|
|||
use crate::update::new::channel::FieldIdDocidFacetSender;
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::ref_cell_ext::RefCellExt as _;
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
|
@ -373,26 +373,16 @@ fn truncate_str(s: &str) -> &str {
|
|||
|
||||
impl FacetedDocidsExtractor {
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
|
||||
pub fn run_extraction<
|
||||
'pl,
|
||||
'fid,
|
||||
'indexer,
|
||||
'index,
|
||||
'extractor,
|
||||
DC: DocumentChanges<'pl>,
|
||||
MSP,
|
||||
SP,
|
||||
>(
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
sender: &FieldIdDocidFacetSender,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
|
|
|
@ -15,23 +15,22 @@ pub use geo::*;
|
|||
pub use searchable::*;
|
||||
pub use vectors::EmbeddingExtractor;
|
||||
|
||||
use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress};
|
||||
use super::steps::Step;
|
||||
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
|
||||
use super::steps::IndexingStep;
|
||||
use super::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::GrenadParameters;
|
||||
use crate::Result;
|
||||
|
||||
pub trait DocidsExtractor {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync;
|
||||
MSP: Fn() -> bool + Sync;
|
||||
}
|
||||
|
||||
/// TODO move in permissive json pointer
|
||||
|
|
|
@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
|||
use crate::update::new::extract::cache::BalancedCaches;
|
||||
use crate::update::new::extract::perm_json_p::contained_in;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::ref_cell_ext::RefCellExt as _;
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
|
@ -239,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
|
|||
pub struct WordDocidsExtractors;
|
||||
|
||||
impl WordDocidsExtractors {
|
||||
pub fn run_extraction<
|
||||
'pl,
|
||||
'fid,
|
||||
'indexer,
|
||||
'index,
|
||||
'extractor,
|
||||
DC: DocumentChanges<'pl>,
|
||||
MSP,
|
||||
SP,
|
||||
>(
|
||||
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<WordDocidsCaches<'extractor>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let index = indexing_context.index;
|
||||
let rtxn = index.read_txn()?;
|
||||
|
|
|
@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
|
|||
use super::cache::BalancedCaches;
|
||||
use super::DocidsExtractor;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
|
||||
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::GrenadParameters;
|
||||
|
@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
|
|||
}
|
||||
|
||||
pub trait SearchableExtractor: Sized + Sync {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let rtxn = indexing_context.index.read_txn()?;
|
||||
let stop_words = indexing_context.index.stop_words(&rtxn)?;
|
||||
|
@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync {
|
|||
}
|
||||
|
||||
impl<T: SearchableExtractor> DocidsExtractor for T {
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
|
||||
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
|
||||
grenad_parameters: GrenadParameters,
|
||||
document_changes: &DC,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<Vec<BalancedCaches<'extractor>>>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
Self::run_extraction(
|
||||
grenad_parameters,
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
use std::cell::{Cell, RefCell};
|
||||
use std::sync::atomic::Ordering;
|
||||
use std::sync::{Arc, RwLock};
|
||||
|
||||
use bumpalo::Bump;
|
||||
|
@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator;
|
|||
|
||||
use super::super::document_change::DocumentChange;
|
||||
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
|
||||
use crate::progress::{AtomicDocumentStep, Progress};
|
||||
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
|
||||
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};
|
||||
|
||||
|
@ -133,10 +135,8 @@ pub struct IndexingContext<
|
|||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||
'index, // covariant lifetime of the index
|
||||
MSP,
|
||||
SP,
|
||||
> where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
pub index: &'index Index,
|
||||
pub db_fields_ids_map: &'indexer FieldsIdsMap,
|
||||
|
@ -144,7 +144,7 @@ pub struct IndexingContext<
|
|||
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
|
||||
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
|
||||
pub must_stop_processing: &'indexer MSP,
|
||||
pub send_progress: &'indexer SP,
|
||||
pub progress: &'indexer Progress,
|
||||
}
|
||||
|
||||
impl<
|
||||
|
@ -152,18 +152,15 @@ impl<
|
|||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||
'index, // covariant lifetime of the index
|
||||
MSP,
|
||||
SP,
|
||||
> Copy
|
||||
for IndexingContext<
|
||||
'fid, // invariant lifetime of fields ids map
|
||||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||
'index, // covariant lifetime of the index
|
||||
MSP,
|
||||
SP,
|
||||
>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -172,18 +169,15 @@ impl<
|
|||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||
'index, // covariant lifetime of the index
|
||||
MSP,
|
||||
SP,
|
||||
> Clone
|
||||
for IndexingContext<
|
||||
'fid, // invariant lifetime of fields ids map
|
||||
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
|
||||
'index, // covariant lifetime of the index
|
||||
MSP,
|
||||
SP,
|
||||
>
|
||||
where
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
*self
|
||||
|
@ -202,7 +196,6 @@ pub fn extract<
|
|||
EX,
|
||||
DC: DocumentChanges<'pl>,
|
||||
MSP,
|
||||
SP,
|
||||
>(
|
||||
document_changes: &DC,
|
||||
extractor: &EX,
|
||||
|
@ -213,18 +206,18 @@ pub fn extract<
|
|||
doc_allocs,
|
||||
fields_ids_map_store,
|
||||
must_stop_processing,
|
||||
send_progress,
|
||||
}: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
|
||||
progress,
|
||||
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
|
||||
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
|
||||
datastore: &'data ThreadLocal<EX::Data>,
|
||||
step: Step,
|
||||
step: IndexingStep,
|
||||
) -> Result<()>
|
||||
where
|
||||
EX: Extractor<'extractor>,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
tracing::trace!("We are resetting the extractor allocators");
|
||||
progress.update_progress(step);
|
||||
// Clean up and reuse the extractor allocs
|
||||
for extractor_alloc in extractor_allocs.iter_mut() {
|
||||
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
|
||||
|
@ -232,9 +225,11 @@ where
|
|||
}
|
||||
|
||||
let total_documents = document_changes.len() as u32;
|
||||
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
|
||||
progress.update_progress(progress_step);
|
||||
|
||||
let pi = document_changes.iter(CHUNK_SIZE);
|
||||
pi.enumerate().try_arc_for_each_try_init(
|
||||
pi.try_arc_for_each_try_init(
|
||||
|| {
|
||||
DocumentChangeContext::new(
|
||||
index,
|
||||
|
@ -247,13 +242,10 @@ where
|
|||
move |index_alloc| extractor.init_data(index_alloc),
|
||||
)
|
||||
},
|
||||
|context, (finished_documents, items)| {
|
||||
|context, items| {
|
||||
if (must_stop_processing)() {
|
||||
return Err(Arc::new(InternalError::AbortedIndexation.into()));
|
||||
}
|
||||
let finished_documents = (finished_documents * CHUNK_SIZE) as u32;
|
||||
|
||||
(send_progress)(Progress::from_step_substep(step, finished_documents, total_documents));
|
||||
|
||||
// Clean up and reuse the document-specific allocator
|
||||
context.doc_alloc.reset();
|
||||
|
@ -264,6 +256,7 @@ where
|
|||
});
|
||||
|
||||
let res = extractor.process(changes, context).map_err(Arc::new);
|
||||
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
|
||||
|
||||
// send back the doc_alloc in the pool
|
||||
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
|
||||
|
@ -271,32 +264,7 @@ where
|
|||
res
|
||||
},
|
||||
)?;
|
||||
|
||||
(send_progress)(Progress::from_step_substep(step, total_documents, total_documents));
|
||||
step.store(total_documents, Ordering::Relaxed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct Progress {
|
||||
pub finished_steps: u16,
|
||||
pub total_steps: u16,
|
||||
pub step_name: &'static str,
|
||||
pub finished_total_substep: Option<(u32, u32)>,
|
||||
}
|
||||
|
||||
impl Progress {
|
||||
pub fn from_step(step: Step) -> Self {
|
||||
Self {
|
||||
finished_steps: step.finished_steps(),
|
||||
total_steps: Step::total_steps(),
|
||||
step_name: step.name(),
|
||||
finished_total_substep: None,
|
||||
}
|
||||
}
|
||||
pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self {
|
||||
Self {
|
||||
finished_total_substep: Some((finished_substep, total_substep)),
|
||||
..Progress::from_step(step)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -92,11 +92,12 @@ mod test {
|
|||
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::index::tests::TempIndex;
|
||||
use crate::progress::Progress;
|
||||
use crate::update::new::indexer::document_changes::{
|
||||
extract, DocumentChangeContext, Extractor, IndexingContext,
|
||||
};
|
||||
use crate::update::new::indexer::DocumentDeletion;
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::{MostlySend, ThreadLocal};
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::DocumentId;
|
||||
|
@ -164,7 +165,7 @@ mod test {
|
|||
doc_allocs: &doc_allocs,
|
||||
fields_ids_map_store: &fields_ids_map_store,
|
||||
must_stop_processing: &(|| false),
|
||||
send_progress: &(|_progress| {}),
|
||||
progress: &Progress::default(),
|
||||
};
|
||||
|
||||
for _ in 0..3 {
|
||||
|
@ -176,7 +177,7 @@ mod test {
|
|||
context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::ExtractingDocuments,
|
||||
IndexingStep::ExtractingDocuments,
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
use std::sync::atomic::Ordering;
|
||||
|
||||
use bumpalo::collections::CollectIn;
|
||||
use bumpalo::Bump;
|
||||
use bumparaw_collections::RawMap;
|
||||
|
@ -10,11 +12,12 @@ use serde_json::value::RawValue;
|
|||
use serde_json::Deserializer;
|
||||
|
||||
use super::super::document_change::DocumentChange;
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress};
|
||||
use super::document_changes::{DocumentChangeContext, DocumentChanges};
|
||||
use super::retrieve_or_guess_primary_key;
|
||||
use crate::documents::PrimaryKey;
|
||||
use crate::progress::{AtomicPayloadStep, Progress};
|
||||
use crate::update::new::document::Versions;
|
||||
use crate::update::new::steps::Step;
|
||||
use crate::update::new::steps::IndexingStep;
|
||||
use crate::update::new::thread_local::MostlySend;
|
||||
use crate::update::new::{Deletion, Insertion, Update};
|
||||
use crate::update::{AvailableIds, IndexDocumentsMethod};
|
||||
|
@ -45,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> {
|
|||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
|
||||
pub fn into_changes<MSP, SP>(
|
||||
pub fn into_changes<MSP>(
|
||||
self,
|
||||
indexer: &'pl Bump,
|
||||
index: &Index,
|
||||
|
@ -53,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> {
|
|||
primary_key_from_op: Option<&'pl str>,
|
||||
new_fields_ids_map: &mut FieldsIdsMap,
|
||||
must_stop_processing: &MSP,
|
||||
send_progress: &SP,
|
||||
progress: Progress,
|
||||
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
|
||||
where
|
||||
MSP: Fn() -> bool,
|
||||
SP: Fn(Progress),
|
||||
{
|
||||
progress.update_progress(IndexingStep::PreparingPayloads);
|
||||
let Self { operations, method } = self;
|
||||
|
||||
let documents_ids = index.documents_ids(rtxn)?;
|
||||
|
@ -68,16 +71,14 @@ impl<'pl> DocumentOperation<'pl> {
|
|||
let mut primary_key = None;
|
||||
|
||||
let payload_count = operations.len();
|
||||
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
|
||||
progress.update_progress(progress_step);
|
||||
|
||||
for (payload_index, operation) in operations.into_iter().enumerate() {
|
||||
if must_stop_processing() {
|
||||
return Err(InternalError::AbortedIndexation.into());
|
||||
}
|
||||
send_progress(Progress::from_step_substep(
|
||||
Step::PreparingPayloads,
|
||||
payload_index as u32,
|
||||
payload_count as u32,
|
||||
));
|
||||
step.store(payload_index as u32, Ordering::Relaxed);
|
||||
|
||||
let mut bytes = 0;
|
||||
let result = match operation {
|
||||
|
@ -118,12 +119,7 @@ impl<'pl> DocumentOperation<'pl> {
|
|||
};
|
||||
operations_stats.push(PayloadStats { document_count, bytes, error });
|
||||
}
|
||||
|
||||
send_progress(Progress::from_step_substep(
|
||||
Step::PreparingPayloads,
|
||||
payload_count as u32,
|
||||
payload_count as u32,
|
||||
));
|
||||
step.store(payload_count as u32, Ordering::Relaxed);
|
||||
|
||||
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
|
||||
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =
|
||||
|
|
|
@ -5,7 +5,7 @@ use std::thread::{self, Builder};
|
|||
|
||||
use big_s::S;
|
||||
use bumparaw_collections::RawMap;
|
||||
use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
|
||||
use document_changes::{extract, DocumentChanges, IndexingContext};
|
||||
pub use document_deletion::DocumentDeletion;
|
||||
pub use document_operation::{DocumentOperation, PayloadStats};
|
||||
use hashbrown::HashMap;
|
||||
|
@ -22,7 +22,7 @@ use super::channel::*;
|
|||
use super::extract::*;
|
||||
use super::facet_search_builder::FacetSearchBuilder;
|
||||
use super::merger::FacetFieldIdsDelta;
|
||||
use super::steps::Step;
|
||||
use super::steps::IndexingStep;
|
||||
use super::thread_local::ThreadLocal;
|
||||
use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
|
||||
use super::words_prefix_docids::{
|
||||
|
@ -33,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
|
|||
use crate::facet::FacetType;
|
||||
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
|
||||
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
|
||||
use crate::progress::Progress;
|
||||
use crate::proximity::ProximityPrecision;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::new::extract::EmbeddingExtractor;
|
||||
|
@ -60,7 +61,7 @@ mod update_by_function;
|
|||
///
|
||||
/// TODO return stats
|
||||
#[allow(clippy::too_many_arguments)] // clippy: 😝
|
||||
pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
|
||||
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
|
||||
wtxn: &mut RwTxn,
|
||||
index: &'index Index,
|
||||
pool: &ThreadPoolNoAbort,
|
||||
|
@ -71,12 +72,11 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
|
|||
document_changes: &DC,
|
||||
embedders: EmbeddingConfigs,
|
||||
must_stop_processing: &'indexer MSP,
|
||||
send_progress: &'indexer SP,
|
||||
progress: &'indexer Progress,
|
||||
) -> Result<()>
|
||||
where
|
||||
DC: DocumentChanges<'pl>,
|
||||
MSP: Fn() -> bool + Sync,
|
||||
SP: Fn(Progress) + Sync,
|
||||
{
|
||||
let mut bbbuffers = Vec::new();
|
||||
let finished_extraction = AtomicBool::new(false);
|
||||
|
@ -125,7 +125,7 @@ where
|
|||
doc_allocs: &doc_allocs,
|
||||
fields_ids_map_store: &fields_ids_map_store,
|
||||
must_stop_processing,
|
||||
send_progress,
|
||||
progress,
|
||||
};
|
||||
|
||||
let mut index_embeddings = index.embedding_configs(wtxn)?;
|
||||
|
@ -159,7 +159,7 @@ where
|
|||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::ExtractingDocuments,
|
||||
IndexingStep::ExtractingDocuments,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
|
@ -191,7 +191,7 @@ where
|
|||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&extractor_sender.field_id_docid_facet_sender(),
|
||||
Step::ExtractingFacets
|
||||
IndexingStep::ExtractingFacets
|
||||
)?
|
||||
};
|
||||
|
||||
|
@ -224,7 +224,7 @@ where
|
|||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
Step::ExtractingWords
|
||||
IndexingStep::ExtractingWords
|
||||
)?
|
||||
};
|
||||
|
||||
|
@ -302,7 +302,7 @@ where
|
|||
document_changes,
|
||||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
Step::ExtractingWordProximity,
|
||||
IndexingStep::ExtractingWordProximity,
|
||||
)?
|
||||
};
|
||||
|
||||
|
@ -338,7 +338,7 @@ where
|
|||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::ExtractingEmbeddings,
|
||||
IndexingStep::ExtractingEmbeddings,
|
||||
)?;
|
||||
}
|
||||
{
|
||||
|
@ -371,7 +371,7 @@ where
|
|||
indexing_context,
|
||||
&mut extractor_allocs,
|
||||
&datastore,
|
||||
Step::WritingGeoPoints
|
||||
IndexingStep::WritingGeoPoints
|
||||
)?;
|
||||
}
|
||||
|
||||
|
@ -383,9 +383,7 @@ where
|
|||
&indexing_context.must_stop_processing,
|
||||
)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
|
||||
finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
|
||||
|
||||
Result::Ok((facet_field_ids_delta, index_embeddings))
|
||||
|
@ -485,7 +483,7 @@ where
|
|||
)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));
|
||||
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
|
||||
|
||||
let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?;
|
||||
|
||||
|
@ -498,10 +496,7 @@ where
|
|||
break 'vectors;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(
|
||||
Step::WritingEmbeddingsToDatabase,
|
||||
));
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
|
||||
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
|
||||
for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers {
|
||||
let dimensions = *dimensions;
|
||||
|
@ -517,21 +512,19 @@ where
|
|||
index.put_embedding_configs(wtxn, index_embeddings)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets));
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
|
||||
if index.facet_search(wtxn)? {
|
||||
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
|
||||
}
|
||||
|
||||
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords));
|
||||
|
||||
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
|
||||
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
|
||||
compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?;
|
||||
}
|
||||
|
||||
(indexing_context.send_progress)(Progress::from_step(Step::Finalizing));
|
||||
indexing_context.progress.update_progress(IndexingStep::Finalizing);
|
||||
|
||||
Ok(()) as Result<_>
|
||||
})?;
|
||||
|
|
|
@ -1,8 +1,12 @@
|
|||
use std::borrow::Cow;
|
||||
|
||||
use enum_iterator::Sequence;
|
||||
|
||||
use crate::progress::Step;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
|
||||
#[repr(u16)]
|
||||
pub enum Step {
|
||||
#[repr(u8)]
|
||||
pub enum IndexingStep {
|
||||
PreparingPayloads,
|
||||
ExtractingDocuments,
|
||||
ExtractingFacets,
|
||||
|
@ -18,30 +22,31 @@ pub enum Step {
|
|||
Finalizing,
|
||||
}
|
||||
|
||||
impl Step {
|
||||
pub fn name(&self) -> &'static str {
|
||||
impl Step for IndexingStep {
|
||||
fn name(&self) -> Cow<'static, str> {
|
||||
match self {
|
||||
Step::PreparingPayloads => "preparing update file",
|
||||
Step::ExtractingDocuments => "extracting documents",
|
||||
Step::ExtractingFacets => "extracting facets",
|
||||
Step::ExtractingWords => "extracting words",
|
||||
Step::ExtractingWordProximity => "extracting word proximity",
|
||||
Step::ExtractingEmbeddings => "extracting embeddings",
|
||||
Step::WritingGeoPoints => "writing geo points",
|
||||
Step::WritingToDatabase => "writing to database",
|
||||
Step::WaitingForExtractors => "waiting for extractors",
|
||||
Step::WritingEmbeddingsToDatabase => "writing embeddings to database",
|
||||
Step::PostProcessingFacets => "post-processing facets",
|
||||
Step::PostProcessingWords => "post-processing words",
|
||||
Step::Finalizing => "finalizing",
|
||||
IndexingStep::PreparingPayloads => "preparing update file",
|
||||
IndexingStep::ExtractingDocuments => "extracting documents",
|
||||
IndexingStep::ExtractingFacets => "extracting facets",
|
||||
IndexingStep::ExtractingWords => "extracting words",
|
||||
IndexingStep::ExtractingWordProximity => "extracting word proximity",
|
||||
IndexingStep::ExtractingEmbeddings => "extracting embeddings",
|
||||
IndexingStep::WritingGeoPoints => "writing geo points",
|
||||
IndexingStep::WritingToDatabase => "writing to database",
|
||||
IndexingStep::WaitingForExtractors => "waiting for extractors",
|
||||
IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
|
||||
IndexingStep::PostProcessingFacets => "post-processing facets",
|
||||
IndexingStep::PostProcessingWords => "post-processing words",
|
||||
IndexingStep::Finalizing => "finalizing",
|
||||
}
|
||||
.into()
|
||||
}
|
||||
|
||||
pub fn finished_steps(self) -> u16 {
|
||||
self as u16
|
||||
fn current(&self) -> u32 {
|
||||
*self as u32
|
||||
}
|
||||
|
||||
pub const fn total_steps() -> u16 {
|
||||
Self::CARDINALITY as u16
|
||||
fn total(&self) -> u32 {
|
||||
Self::CARDINALITY as u32
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue