5147: Batch progress r=dureuill a=irevoire

# Pull Request

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/5068

## What does this PR do?
- ...

## PR checklist
Please check if your PR fulfills the following requirements:
- [ ] Does this PR fix an existing issue, or have you listed the changes applied in the PR description (and why they are needed)?
- [ ] Have you read the contributing guidelines?
- [ ] Have you made sure that the title is accurate and descriptive of the changes?

Thank you so much for contributing to Meilisearch!


Co-authored-by: Tamo <tamo@meilisearch.com>
meili-bors[bot] 2024-12-12 09:15:54 +00:00 committed by GitHub
commit 1fc90fbacb
38 changed files with 940 additions and 473 deletions
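
Taken together, the hunks below make one mechanical change: the push-style `SP: Fn(Progress) + Sync` callback that was threaded through every indexing signature is removed, and `IndexingContext` instead carries a shared `&Progress` handle (from the new `crate::progress` module) that each phase updates in place via `update_progress`, with atomic counters (`AtomicDocumentStep`, `AtomicPayloadStep`) tracking fine-grained completion. A minimal self-contained sketch of that pattern, using toy types rather than the actual `crate::progress` API:

```rust
// Toy analog of a pull-based progress handle (illustrative, not the real
// Meilisearch types): workers write into shared atomics instead of calling
// a `send_progress` closure, and any observer snapshots at its own pace.
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;

#[derive(Default, Clone)]
pub struct Progress {
    current: Arc<AtomicU32>,
    total: Arc<AtomicU32>,
}

impl Progress {
    /// Plays the role of `update_progress` in the hunks below.
    pub fn update_progress(&self, current: u32, total: u32) {
        self.total.store(total, Ordering::Relaxed);
        self.current.store(current, Ordering::Relaxed);
    }

    /// An observer (e.g. a batches route) polls instead of being called back.
    pub fn snapshot(&self) -> (u32, u32) {
        (self.current.load(Ordering::Relaxed), self.total.load(Ordering::Relaxed))
    }
}
```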

View file

@@ -766,6 +766,7 @@ mod tests {
use crate::documents::mmap_from_objects;
use crate::index::tests::TempIndex;
use crate::index::IndexEmbeddingConfig;
use crate::progress::Progress;
use crate::search::TermsMatchingStrategy;
use crate::update::new::indexer;
use crate::update::Setting;
@@ -1964,7 +1965,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2148,7 +2149,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2163,7 +2164,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2210,7 +2211,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2225,7 +2226,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2263,7 +2264,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2278,7 +2279,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2315,7 +2316,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2330,7 +2331,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2369,7 +2370,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2384,7 +2385,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2428,7 +2429,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2443,7 +2444,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2480,7 +2481,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2495,7 +2496,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2532,7 +2533,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2547,7 +2548,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2726,7 +2727,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2741,7 +2742,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2785,7 +2786,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2800,7 +2801,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
@@ -2841,7 +2842,7 @@ mod tests {
None,
&mut new_fields_ids_map,
&|| false,
&|_progress| (),
Progress::default(),
)
.unwrap();
@@ -2856,7 +2857,7 @@ mod tests {
&document_changes,
embedders,
&|| false,
&|_| (),
&Progress::default(),
)
.unwrap();
wtxn.commit().unwrap();
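
The net effect at these test call sites: every dummy `&|_progress| ()` closure becomes a `Progress` value or reference that is cheap to default-construct. A hypothetical before/after with toy signatures (the real entry points are `into_changes` and `indexer::index`):

```rust
// Before/after shape of a call site (toy types and function names).
#[derive(Default)]
struct Progress;

fn index_old<SP: Fn(u32)>(send_progress: &SP) {
    send_progress(0); // progress pushed out through the closure
}

fn index_new(_progress: &Progress) {
    // progress is written into the shared handle instead
}

fn main() {
    index_old(&|_progress| ()); // before: a no-op closure in every test
    index_new(&Progress::default()); // after: one default handle
}
```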

View file

@@ -16,10 +16,10 @@ use crate::update::del_add::DelAdd;
use crate::update::new::channel::FieldIdDocidFacetSender;
use crate::update::new::extract::perm_json_p;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@@ -373,26 +373,16 @@ fn truncate_str(s: &str) -> &str {
impl FacetedDocidsExtractor {
#[tracing::instrument(level = "trace", skip_all, target = "indexing::extract::faceted")]
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
sender: &FieldIdDocidFacetSender,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;
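
Dropping the `SP` bound is what lets the signature above collapse from a ten-line generic parameter list to one line: only `must_stop_processing` still needs a closure type parameter, while progress becomes a concrete reference. A toy illustration of that simplification (hypothetical names):

```rust
// Why the `SP` type parameter can vanish (toy sketch).
struct Progress;

// Before: two closure bounds threaded through every extractor.
fn run_extraction_old<MSP: Fn() -> bool + Sync, SP: Fn(u32) + Sync>(
    must_stop_processing: &MSP,
    send_progress: &SP,
) {
    if !must_stop_processing() {
        send_progress(0);
    }
}

// After: one bound; progress is a plain shared reference.
fn run_extraction_new<MSP: Fn() -> bool + Sync>(
    must_stop_processing: &MSP,
    _progress: &Progress,
) {
    let _ = must_stop_processing();
}

fn main() {
    run_extraction_old(&|| false, &|_| ());
    run_extraction_new(&|| false, &Progress);
}
```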

View file

@@ -15,23 +15,22 @@ pub use geo::*;
pub use searchable::*;
pub use vectors::EmbeddingExtractor;
use super::indexer::document_changes::{DocumentChanges, IndexingContext, Progress};
use super::steps::Step;
use super::indexer::document_changes::{DocumentChanges, IndexingContext};
use super::steps::IndexingStep;
use super::thread_local::{FullySend, ThreadLocal};
use crate::update::GrenadParameters;
use crate::Result;
pub trait DocidsExtractor {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync;
MSP: Fn() -> bool + Sync;
}
/// TODO move in permissive json pointer

View file

@@ -11,10 +11,10 @@ use super::tokenize_document::{tokenizer_builder, DocumentTokenizer};
use crate::update::new::extract::cache::BalancedCaches;
use crate::update::new::extract::perm_json_p::contained_in;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::ref_cell_ext::RefCellExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@@ -239,25 +239,15 @@ impl<'a, 'extractor> Extractor<'extractor> for WordDocidsExtractorData<'a> {
pub struct WordDocidsExtractors;
impl WordDocidsExtractors {
pub fn run_extraction<
'pl,
'fid,
'indexer,
'index,
'extractor,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
pub fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<WordDocidsCaches<'extractor>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let index = indexing_context.index;
let rtxn = index.read_txn()?;

View file

@@ -14,9 +14,9 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer};
use super::cache::BalancedCaches;
use super::DocidsExtractor;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext, Progress,
extract, DocumentChangeContext, DocumentChanges, Extractor, IndexingContext,
};
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::update::GrenadParameters;
@@ -56,16 +56,15 @@ impl<'a, 'extractor, EX: SearchableExtractor + Sync> Extractor<'extractor>
}
pub trait SearchableExtractor: Sized + Sync {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let rtxn = indexing_context.index.read_txn()?;
let stop_words = indexing_context.index.stop_words(&rtxn)?;
@@ -134,16 +133,15 @@ pub trait SearchableExtractor: Sized + Sync {
}
impl<T: SearchableExtractor> DocidsExtractor for T {
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP, SP>(
fn run_extraction<'pl, 'fid, 'indexer, 'index, 'extractor, DC: DocumentChanges<'pl>, MSP>(
grenad_parameters: GrenadParameters,
document_changes: &DC,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
indexing_context: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
step: Step,
step: IndexingStep,
) -> Result<Vec<BalancedCaches<'extractor>>>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
Self::run_extraction(
grenad_parameters,

View file

@@ -1,4 +1,5 @@
use std::cell::{Cell, RefCell};
use std::sync::atomic::Ordering;
use std::sync::{Arc, RwLock};
use bumpalo::Bump;
@@ -7,8 +8,9 @@ use rayon::iter::IndexedParallelIterator;
use super::super::document_change::DocumentChange;
use crate::fields_ids_map::metadata::FieldIdMapWithMetadata;
use crate::progress::{AtomicDocumentStep, Progress};
use crate::update::new::parallel_iterator_ext::ParallelIteratorExt as _;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{FullySend, MostlySend, ThreadLocal};
use crate::{FieldsIdsMap, GlobalFieldsIdsMap, Index, InternalError, Result};
@@ -133,10 +135,8 @@ pub struct IndexingContext<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
pub index: &'index Index,
pub db_fields_ids_map: &'indexer FieldsIdsMap,
@@ -144,7 +144,7 @@ pub struct IndexingContext<
pub doc_allocs: &'indexer ThreadLocal<FullySend<Cell<Bump>>>,
pub fields_ids_map_store: &'indexer ThreadLocal<FullySend<RefCell<GlobalFieldsIdsMap<'fid>>>>,
pub must_stop_processing: &'indexer MSP,
pub send_progress: &'indexer SP,
pub progress: &'indexer Progress,
}
impl<
@@ -152,18 +152,15 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Copy
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
}
@@ -172,18 +169,15 @@ impl<
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
> Clone
for IndexingContext<
'fid, // invariant lifetime of fields ids map
'indexer, // covariant lifetime of objects that are borrowed during the entire indexing operation
'index, // covariant lifetime of the index
MSP,
SP,
>
where
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
fn clone(&self) -> Self {
*self
@@ -202,7 +196,6 @@ pub fn extract<
EX,
DC: DocumentChanges<'pl>,
MSP,
SP,
>(
document_changes: &DC,
extractor: &EX,
@@ -213,18 +206,18 @@ pub fn extract<
doc_allocs,
fields_ids_map_store,
must_stop_processing,
send_progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP, SP>,
progress,
}: IndexingContext<'fid, 'indexer, 'index, MSP>,
extractor_allocs: &'extractor mut ThreadLocal<FullySend<Bump>>,
datastore: &'data ThreadLocal<EX::Data>,
step: Step,
step: IndexingStep,
) -> Result<()>
where
EX: Extractor<'extractor>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
tracing::trace!("We are resetting the extractor allocators");
progress.update_progress(step);
// Clean up and reuse the extractor allocs
for extractor_alloc in extractor_allocs.iter_mut() {
tracing::trace!("\tWith {} bytes reset", extractor_alloc.0.allocated_bytes());
@@ -232,9 +225,11 @@ where
}
let total_documents = document_changes.len() as u32;
let (step, progress_step) = AtomicDocumentStep::new(total_documents);
progress.update_progress(progress_step);
let pi = document_changes.iter(CHUNK_SIZE);
pi.enumerate().try_arc_for_each_try_init(
pi.try_arc_for_each_try_init(
|| {
DocumentChangeContext::new(
index,
@ -247,13 +242,10 @@ where
move |index_alloc| extractor.init_data(index_alloc),
)
},
|context, (finished_documents, items)| {
|context, items| {
if (must_stop_processing)() {
return Err(Arc::new(InternalError::AbortedIndexation.into()));
}
let finished_documents = (finished_documents * CHUNK_SIZE) as u32;
(send_progress)(Progress::from_step_substep(step, finished_documents, total_documents));
// Clean up and reuse the document-specific allocator
context.doc_alloc.reset();
@ -264,6 +256,7 @@ where
});
let res = extractor.process(changes, context).map_err(Arc::new);
step.fetch_add(items.as_ref().len() as u32, Ordering::Relaxed);
// send back the doc_alloc in the pool
context.doc_allocs.get_or_default().0.set(std::mem::take(&mut context.doc_alloc));
@@ -271,32 +264,7 @@ where
res
},
)?;
(send_progress)(Progress::from_step_substep(step, total_documents, total_documents));
step.store(total_documents, Ordering::Relaxed);
Ok(())
}
pub struct Progress {
pub finished_steps: u16,
pub total_steps: u16,
pub step_name: &'static str,
pub finished_total_substep: Option<(u32, u32)>,
}
impl Progress {
pub fn from_step(step: Step) -> Self {
Self {
finished_steps: step.finished_steps(),
total_steps: Step::total_steps(),
step_name: step.name(),
finished_total_substep: None,
}
}
pub fn from_step_substep(step: Step, finished_substep: u32, total_substep: u32) -> Self {
Self {
finished_total_substep: Some((finished_substep, total_substep)),
..Progress::from_step(step)
}
}
}
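
The counting scheme replacing the old `(finished_documents, items)` enumeration above: `AtomicDocumentStep::new(total)` hands out a shared counter that every worker bumps with `fetch_add` as it finishes items, and that is clamped to `total` once iteration ends. A self-contained analog with plain `std` threads standing in for the rayon pool:

```rust
// Toy analog of the per-document atomic counter (std threads, not rayon).
use std::sync::atomic::{AtomicU32, Ordering};
use std::sync::Arc;
use std::thread;

fn main() {
    let total = 1_000u32;
    let step = Arc::new(AtomicU32::new(0)); // one counter per indexing step
    let workers: Vec<_> = (0..4)
        .map(|_| {
            let step = Arc::clone(&step);
            thread::spawn(move || {
                for _ in 0..(total / 4) {
                    // mirrors `step.fetch_add(items.len() as u32, Ordering::Relaxed)`
                    step.fetch_add(1, Ordering::Relaxed);
                }
            })
        })
        .collect();
    for worker in workers {
        worker.join().unwrap();
    }
    step.store(total, Ordering::Relaxed); // final clamp, as in the hunk above
    assert_eq!(step.load(Ordering::Relaxed), total);
}
```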

View file

@@ -92,11 +92,12 @@ mod test {
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::tests::TempIndex;
use crate::progress::Progress;
use crate::update::new::indexer::document_changes::{
extract, DocumentChangeContext, Extractor, IndexingContext,
};
use crate::update::new::indexer::DocumentDeletion;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::{MostlySend, ThreadLocal};
use crate::update::new::DocumentChange;
use crate::DocumentId;
@@ -164,7 +165,7 @@ mod test {
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
must_stop_processing: &(|| false),
send_progress: &(|_progress| {}),
progress: &Progress::default(),
};
for _ in 0..3 {
@@ -176,7 +177,7 @@ mod test {
context,
&mut extractor_allocs,
&datastore,
Step::ExtractingDocuments,
IndexingStep::ExtractingDocuments,
)
.unwrap();

View file

@@ -1,3 +1,5 @@
use std::sync::atomic::Ordering;
use bumpalo::collections::CollectIn;
use bumpalo::Bump;
use bumparaw_collections::RawMap;
@@ -10,11 +12,12 @@ use serde_json::value::RawValue;
use serde_json::Deserializer;
use super::super::document_change::DocumentChange;
use super::document_changes::{DocumentChangeContext, DocumentChanges, Progress};
use super::document_changes::{DocumentChangeContext, DocumentChanges};
use super::retrieve_or_guess_primary_key;
use crate::documents::PrimaryKey;
use crate::progress::{AtomicPayloadStep, Progress};
use crate::update::new::document::Versions;
use crate::update::new::steps::Step;
use crate::update::new::steps::IndexingStep;
use crate::update::new::thread_local::MostlySend;
use crate::update::new::{Deletion, Insertion, Update};
use crate::update::{AvailableIds, IndexDocumentsMethod};
@@ -45,7 +48,7 @@ impl<'pl> DocumentOperation<'pl> {
#[allow(clippy::too_many_arguments)]
#[tracing::instrument(level = "trace", skip_all, target = "indexing::document_operation")]
pub fn into_changes<MSP, SP>(
pub fn into_changes<MSP>(
self,
indexer: &'pl Bump,
index: &Index,
@@ -53,12 +56,12 @@ impl<'pl> DocumentOperation<'pl> {
primary_key_from_op: Option<&'pl str>,
new_fields_ids_map: &mut FieldsIdsMap,
must_stop_processing: &MSP,
send_progress: &SP,
progress: Progress,
) -> Result<(DocumentOperationChanges<'pl>, Vec<PayloadStats>, Option<PrimaryKey<'pl>>)>
where
MSP: Fn() -> bool,
SP: Fn(Progress),
{
progress.update_progress(IndexingStep::PreparingPayloads);
let Self { operations, method } = self;
let documents_ids = index.documents_ids(rtxn)?;
@@ -68,16 +71,14 @@ impl<'pl> DocumentOperation<'pl> {
let mut primary_key = None;
let payload_count = operations.len();
let (step, progress_step) = AtomicPayloadStep::new(payload_count as u32);
progress.update_progress(progress_step);
for (payload_index, operation) in operations.into_iter().enumerate() {
if must_stop_processing() {
return Err(InternalError::AbortedIndexation.into());
}
send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_index as u32,
payload_count as u32,
));
step.store(payload_index as u32, Ordering::Relaxed);
let mut bytes = 0;
let result = match operation {
@@ -118,12 +119,7 @@ impl<'pl> DocumentOperation<'pl> {
};
operations_stats.push(PayloadStats { document_count, bytes, error });
}
send_progress(Progress::from_step_substep(
Step::PreparingPayloads,
payload_count as u32,
payload_count as u32,
));
step.store(payload_count as u32, Ordering::Relaxed);
// TODO We must drain the HashMap into a Vec because rayon::hash_map::IntoIter: !Clone
let mut docids_version_offsets: bumpalo::collections::vec::Vec<_> =
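
Payload preparation follows the same recipe in miniature: one atomic `store` per payload replaces one closure invocation per payload, plus a final `store` once all payloads are parsed. A toy version of that loop (illustrative data):

```rust
// Toy version of the payload-progress loop above.
use std::sync::atomic::{AtomicU32, Ordering};

fn main() {
    let payloads = ["a.ndjson", "b.ndjson", "c.ndjson"];
    let step = AtomicU32::new(0); // stands in for AtomicPayloadStep's counter
    for (payload_index, _payload) in payloads.iter().enumerate() {
        step.store(payload_index as u32, Ordering::Relaxed);
        // ... parse and validate the payload here ...
    }
    step.store(payloads.len() as u32, Ordering::Relaxed); // all payloads done
    assert_eq!(step.load(Ordering::Relaxed), 3);
}
```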

View file

@@ -5,7 +5,7 @@ use std::thread::{self, Builder};
use big_s::S;
use bumparaw_collections::RawMap;
use document_changes::{extract, DocumentChanges, IndexingContext, Progress};
use document_changes::{extract, DocumentChanges, IndexingContext};
pub use document_deletion::DocumentDeletion;
pub use document_operation::{DocumentOperation, PayloadStats};
use hashbrown::HashMap;
@@ -22,7 +22,7 @@ use super::channel::*;
use super::extract::*;
use super::facet_search_builder::FacetSearchBuilder;
use super::merger::FacetFieldIdsDelta;
use super::steps::Step;
use super::steps::IndexingStep;
use super::thread_local::ThreadLocal;
use super::word_fst_builder::{PrefixData, PrefixDelta, WordFstBuilder};
use super::words_prefix_docids::{
@@ -33,6 +33,7 @@ use crate::documents::{PrimaryKey, DEFAULT_PRIMARY_KEY};
use crate::facet::FacetType;
use crate::fields_ids_map::metadata::{FieldIdMapWithMetadata, MetadataBuilder};
use crate::index::main_key::{WORDS_FST_KEY, WORDS_PREFIXES_FST_KEY};
use crate::progress::Progress;
use crate::proximity::ProximityPrecision;
use crate::update::del_add::DelAdd;
use crate::update::new::extract::EmbeddingExtractor;
@@ -60,7 +61,7 @@ mod update_by_function;
///
/// TODO return stats
#[allow(clippy::too_many_arguments)] // clippy: 😝
pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
pub fn index<'pl, 'indexer, 'index, DC, MSP>(
wtxn: &mut RwTxn,
index: &'index Index,
pool: &ThreadPoolNoAbort,
@@ -71,12 +72,11 @@ pub fn index<'pl, 'indexer, 'index, DC, MSP, SP>(
document_changes: &DC,
embedders: EmbeddingConfigs,
must_stop_processing: &'indexer MSP,
send_progress: &'indexer SP,
progress: &'indexer Progress,
) -> Result<()>
where
DC: DocumentChanges<'pl>,
MSP: Fn() -> bool + Sync,
SP: Fn(Progress) + Sync,
{
let mut bbbuffers = Vec::new();
let finished_extraction = AtomicBool::new(false);
@@ -125,7 +125,7 @@ where
doc_allocs: &doc_allocs,
fields_ids_map_store: &fields_ids_map_store,
must_stop_processing,
send_progress,
progress,
};
let mut index_embeddings = index.embedding_configs(wtxn)?;
@@ -159,7 +159,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
Step::ExtractingDocuments,
IndexingStep::ExtractingDocuments,
)?;
}
{
@@ -191,7 +191,7 @@ where
indexing_context,
&mut extractor_allocs,
&extractor_sender.field_id_docid_facet_sender(),
Step::ExtractingFacets
IndexingStep::ExtractingFacets
)?
};
@@ -224,7 +224,7 @@ where
document_changes,
indexing_context,
&mut extractor_allocs,
Step::ExtractingWords
IndexingStep::ExtractingWords
)?
};
@@ -302,7 +302,7 @@ where
document_changes,
indexing_context,
&mut extractor_allocs,
Step::ExtractingWordProximity,
IndexingStep::ExtractingWordProximity,
)?
};
@@ -338,7 +338,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
Step::ExtractingEmbeddings,
IndexingStep::ExtractingEmbeddings,
)?;
}
{
@@ -371,7 +371,7 @@ where
indexing_context,
&mut extractor_allocs,
&datastore,
Step::WritingGeoPoints
IndexingStep::WritingGeoPoints
)?;
}
@@ -383,9 +383,7 @@ where
&indexing_context.must_stop_processing,
)?;
}
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
indexing_context.progress.update_progress(IndexingStep::WritingToDatabase);
finished_extraction.store(true, std::sync::atomic::Ordering::Relaxed);
Result::Ok((facet_field_ids_delta, index_embeddings))
@@ -485,7 +483,7 @@ where
)?;
}
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));
indexing_context.progress.update_progress(IndexingStep::WaitingForExtractors);
let (facet_field_ids_delta, index_embeddings) = extractor_handle.join().unwrap()?;
@@ -498,10 +496,7 @@ where
break 'vectors;
}
(indexing_context.send_progress)(Progress::from_step(
Step::WritingEmbeddingsToDatabase,
));
indexing_context.progress.update_progress(IndexingStep::WritingEmbeddingsToDatabase);
let mut rng = rand::rngs::StdRng::seed_from_u64(42);
for (_index, (_embedder_name, _embedder, writer, dimensions)) in &mut arroy_writers {
let dimensions = *dimensions;
@@ -517,21 +512,19 @@ where
index.put_embedding_configs(wtxn, index_embeddings)?;
}
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingFacets));
indexing_context.progress.update_progress(IndexingStep::PostProcessingFacets);
if index.facet_search(wtxn)? {
compute_facet_search_database(index, wtxn, global_fields_ids_map)?;
}
compute_facet_level_database(index, wtxn, facet_field_ids_delta)?;
(indexing_context.send_progress)(Progress::from_step(Step::PostProcessingWords));
indexing_context.progress.update_progress(IndexingStep::PostProcessingWords);
if let Some(prefix_delta) = compute_word_fst(index, wtxn)? {
compute_prefix_database(index, wtxn, prefix_delta, grenad_parameters)?;
}
(indexing_context.send_progress)(Progress::from_step(Step::Finalizing));
indexing_context.progress.update_progress(IndexingStep::Finalizing);
Ok(()) as Result<_>
})?;
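
Every coarse phase transition in this file now reads `indexing_context.progress.update_progress(IndexingStep::…)` instead of invoking the stored `send_progress` closure. A toy contrast of the two shapes (hypothetical internals; the real handle's storage isn't shown in this diff):

```rust
// Method on a shared handle vs. calling a stored closure (toy types).
use std::sync::Mutex;

#[derive(Debug, Clone, Copy)]
enum IndexingStep {
    WritingToDatabase,
    WaitingForExtractors,
    Finalizing,
}

#[derive(Default)]
struct Progress {
    current: Mutex<Option<IndexingStep>>, // assumption: latest step is recorded
}

impl Progress {
    fn update_progress(&self, step: IndexingStep) {
        *self.current.lock().unwrap() = Some(step);
    }
}

fn main() {
    let progress = Progress::default();
    progress.update_progress(IndexingStep::WritingToDatabase);
    progress.update_progress(IndexingStep::WaitingForExtractors);
    progress.update_progress(IndexingStep::Finalizing);
    println!("{:?}", progress.current.lock().unwrap()); // Some(Finalizing)
}
```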

View file

@@ -1,8 +1,12 @@
use std::borrow::Cow;
use enum_iterator::Sequence;
use crate::progress::Step;
#[derive(Debug, Clone, Copy, PartialEq, Eq, Sequence)]
#[repr(u16)]
pub enum Step {
#[repr(u8)]
pub enum IndexingStep {
PreparingPayloads,
ExtractingDocuments,
ExtractingFacets,
@@ -18,30 +22,31 @@ pub enum Step {
Finalizing,
}
impl Step {
pub fn name(&self) -> &'static str {
impl Step for IndexingStep {
fn name(&self) -> Cow<'static, str> {
match self {
Step::PreparingPayloads => "preparing update file",
Step::ExtractingDocuments => "extracting documents",
Step::ExtractingFacets => "extracting facets",
Step::ExtractingWords => "extracting words",
Step::ExtractingWordProximity => "extracting word proximity",
Step::ExtractingEmbeddings => "extracting embeddings",
Step::WritingGeoPoints => "writing geo points",
Step::WritingToDatabase => "writing to database",
Step::WaitingForExtractors => "waiting for extractors",
Step::WritingEmbeddingsToDatabase => "writing embeddings to database",
Step::PostProcessingFacets => "post-processing facets",
Step::PostProcessingWords => "post-processing words",
Step::Finalizing => "finalizing",
IndexingStep::PreparingPayloads => "preparing update file",
IndexingStep::ExtractingDocuments => "extracting documents",
IndexingStep::ExtractingFacets => "extracting facets",
IndexingStep::ExtractingWords => "extracting words",
IndexingStep::ExtractingWordProximity => "extracting word proximity",
IndexingStep::ExtractingEmbeddings => "extracting embeddings",
IndexingStep::WritingGeoPoints => "writing geo points",
IndexingStep::WritingToDatabase => "writing to database",
IndexingStep::WaitingForExtractors => "waiting for extractors",
IndexingStep::WritingEmbeddingsToDatabase => "writing embeddings to database",
IndexingStep::PostProcessingFacets => "post-processing facets",
IndexingStep::PostProcessingWords => "post-processing words",
IndexingStep::Finalizing => "finalizing",
}
.into()
}
pub fn finished_steps(self) -> u16 {
self as u16
fn current(&self) -> u32 {
*self as u32
}
pub const fn total_steps() -> u16 {
Self::CARDINALITY as u16
fn total(&self) -> u32 {
Self::CARDINALITY as u32
}
}
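
The enum itself is now just one implementor of a generic `Step` trait. The trait's definition lives in `crate::progress` and isn't part of this diff, so the sketch below infers it from the impl above (abbreviated to three of the thirteen variants):

```rust
// Inferred shape of the `Step` trait and its IndexingStep implementor.
use std::borrow::Cow;

trait Step {
    fn name(&self) -> Cow<'static, str>;
    fn current(&self) -> u32;
    fn total(&self) -> u32;
}

#[derive(Debug, Clone, Copy)]
#[repr(u8)]
enum IndexingStep {
    PreparingPayloads,
    ExtractingDocuments,
    Finalizing, // ...ten more variants in the real enum
}

impl Step for IndexingStep {
    fn name(&self) -> Cow<'static, str> {
        match self {
            IndexingStep::PreparingPayloads => "preparing update file",
            IndexingStep::ExtractingDocuments => "extracting documents",
            IndexingStep::Finalizing => "finalizing",
        }
        .into()
    }

    fn current(&self) -> u32 {
        *self as u32 // discriminant order doubles as progress position
    }

    fn total(&self) -> u32 {
        3 // stands in for `Self::CARDINALITY` from enum_iterator::Sequence
    }
}

fn main() {
    let step = IndexingStep::ExtractingDocuments;
    println!("{} ({}/{})", step.name(), step.current() + 1, step.total());
}
```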