5092: Precise spans for new indexer r=dureuill a=dureuill

- Separate extract and merge spans
- Add span around commit

Co-authored-by: Louis Dureuil <louis@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2024-11-26 10:59:40 +00:00 committed by GitHub
commit fb66fec398
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 214 additions and 167 deletions

View File

@ -1024,7 +1024,13 @@ impl IndexScheduler {
let mut index_wtxn = index.write_txn()?;
let tasks = self.apply_index_operation(&mut index_wtxn, &index, op)?;
{
let span = tracing::trace_span!(target: "indexing::scheduler", "commit");
let _entered = span.enter();
index_wtxn.commit()?;
}
// if the update processed successfully, we're going to store the new
// stats of the index. Since the tasks have already been processed and

View File

@ -109,11 +109,14 @@ where
let rtxn = index.read_txn()?;
// document but we need to create a function that collects and compresses documents.
let document_sender = extractor_sender.documents();
let document_extractor = DocumentsExtractor::new(&document_sender, embedders);
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
{
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "documents");
let _entered = span.enter();
extract(document_changes,
&document_extractor,
indexing_context,
@ -121,7 +124,10 @@ where
&datastore,
Step::ExtractingDocuments,
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "documents");
let _entered = span.enter();
for document_extractor_data in datastore {
let document_extractor_data = document_extractor_data.0.into_inner();
for (field, delta) in document_extractor_data.field_distribution_delta {
@ -133,14 +139,15 @@ where
}
field_distribution.retain(|_, v| *v != 0);
}
let facet_field_ids_delta;
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "faceted");
let caches = {
let span = tracing::trace_span!(target: "indexing::documents::extract", parent: &indexer_span, "faceted");
let _entered = span.enter();
facet_field_ids_delta = merge_and_send_facet_docids(
FacetedDocidsExtractor::run_extraction(
grenad_parameters,
document_changes,
@ -148,16 +155,25 @@ where
&mut extractor_allocs,
&extractor_sender.field_id_docid_facet_sender(),
Step::ExtractingFacets
)?,
)?
};
{
let span = tracing::trace_span!(target: "indexing::documents::merge", parent: &indexer_span, "faceted");
let _entered = span.enter();
facet_field_ids_delta = merge_and_send_facet_docids(
caches,
FacetDatabases::new(index),
index,
extractor_sender.facet_docids(),
)?;
}
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
let _entered = span.enter();
let WordDocidsCaches {
@ -166,15 +182,19 @@ where
exact_word_docids,
word_position_docids,
fid_word_count_docids,
} = WordDocidsExtractors::run_extraction(
} = {
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_docids");
let _entered = span.enter();
WordDocidsExtractors::run_extraction(
grenad_parameters,
document_changes,
indexing_context,
&mut extractor_allocs,
Step::ExtractingWords
)?;
)?
};
// TODO Word Docids Merger
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_docids");
let _entered = span.enter();
@ -187,7 +207,6 @@ where
)?;
}
// Word Fid Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_fid_docids");
let _entered = span.enter();
@ -200,7 +219,6 @@ where
)?;
}
// Exact Word Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "exact_word_docids");
let _entered = span.enter();
@ -213,7 +231,6 @@ where
)?;
}
// Word Position Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_position_docids");
let _entered = span.enter();
@ -226,7 +243,6 @@ where
)?;
}
// Fid Word Count Docids Merging
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "fid_word_count_docids");
let _entered = span.enter();
@ -244,17 +260,22 @@ where
// this works only if the settings didn't change during this transaction.
let proximity_precision = index.proximity_precision(&rtxn)?.unwrap_or_default();
if proximity_precision == ProximityPrecision::ByWord {
let caches = {
let span = tracing::trace_span!(target: "indexing::documents::extract", "word_pair_proximity_docids");
let _entered = span.enter();
let caches = <WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
<WordPairProximityDocidsExtractor as DocidsExtractor>::run_extraction(
grenad_parameters,
document_changes,
indexing_context,
&mut extractor_allocs,
Step::ExtractingWordProximity,
)?;
)?
};
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "word_pair_proximity_docids");
let _entered = span.enter();
merge_and_send_docids(
caches,
@ -264,10 +285,9 @@ where
&indexing_context.must_stop_processing,
)?;
}
}
'vectors: {
let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors");
let _entered = span.enter();
let mut index_embeddings = index.embedding_configs(&rtxn)?;
if index_embeddings.is_empty() {
@ -277,7 +297,15 @@ where
let embedding_sender = extractor_sender.embeddings();
let extractor = EmbeddingExtractor::new(embedders, &embedding_sender, field_distribution, request_threads());
let mut datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "vectors");
let _entered = span.enter();
extract(document_changes, &extractor, indexing_context, &mut extractor_allocs, &datastore, Step::ExtractingEmbeddings)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::merge", "vectors");
let _entered = span.enter();
for config in &mut index_embeddings {
'data: for data in datastore.iter_mut() {
@ -286,18 +314,21 @@ where
deladd.apply_to(&mut config.user_provided);
}
}
}
embedding_sender.finish(index_embeddings).unwrap();
}
'geo: {
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
let _entered = span.enter();
let Some(extractor) = GeoExtractor::new(&rtxn, index, grenad_parameters)? else {
break 'geo;
};
let datastore = ThreadLocal::with_capacity(rayon::current_num_threads());
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "geo");
let _entered = span.enter();
extract(
document_changes,
&extractor,
@ -306,6 +337,7 @@ where
&datastore,
Step::WritingGeoPoints
)?;
}
merge_and_send_rtree(
datastore,
@ -316,11 +348,7 @@ where
)?;
}
{
let span = tracing::trace_span!(target: "indexing::documents::extract", "FINISH");
let _entered = span.enter();
(indexing_context.send_progress)(Progress::from_step(Step::WritingToDatabase));
}
Result::Ok(facet_field_ids_delta)
})?;
@ -352,6 +380,10 @@ where
.collect();
let mut arroy_writers = arroy_writers?;
{
let span = tracing::trace_span!(target: "indexing::write_db", "all");
let _entered = span.enter();
for operation in writer_receiver {
match operation {
WriterOperation::DbOperation(db_operation) => {
@ -362,11 +394,13 @@ where
Ok(false) => unreachable!("We tried to delete an unknown key"),
Ok(_) => (),
Err(error) => {
return Err(Error::InternalError(InternalError::StoreDeletion {
return Err(Error::InternalError(
InternalError::StoreDeletion {
database_name,
key: e.entry().to_owned(),
error,
}));
},
));
}
},
EntryOperation::Write(e) => {
@ -383,8 +417,10 @@ where
}
WriterOperation::ArroyOperation(arroy_operation) => match arroy_operation {
ArroyOperation::DeleteVectors { docid } => {
for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in
&mut arroy_writers
for (
_embedder_index,
(_embedder_name, _embedder, writer, dimensions),
) in &mut arroy_writers
{
let dimensions = *dimensions;
writer.del_items(wtxn, dimensions, docid)?;
@ -395,9 +431,10 @@ where
embedder_id,
embeddings: raw_embeddings,
} => {
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
// TODO: switch to Embeddings
let (_, _, writer, dimensions) = arroy_writers
.get(&embedder_id)
.expect("requested a missing embedder");
let mut embeddings = Embeddings::new(*dimensions);
for embedding in raw_embeddings {
embeddings.append(embedding).unwrap();
@ -407,8 +444,9 @@ where
writer.add_items(wtxn, docid, &embeddings)?;
}
ArroyOperation::SetVector { docid, embedder_id, embedding } => {
let (_, _, writer, dimensions) =
arroy_writers.get(&embedder_id).expect("requested a missing embedder");
let (_, _, writer, dimensions) = arroy_writers
.get(&embedder_id)
.expect("requested a missing embedder");
writer.del_items(wtxn, *dimensions, docid)?;
writer.add_item(wtxn, docid, &embedding)?;
}
@ -420,8 +458,10 @@ where
Step::WritingEmbeddingsToDatabase,
));
for (_embedder_index, (_embedder_name, _embedder, writer, dimensions)) in
&mut arroy_writers
for (
_embedder_index,
(_embedder_name, _embedder, writer, dimensions),
) in &mut arroy_writers
{
let dimensions = *dimensions;
writer.build_and_quantize(
@ -438,6 +478,7 @@ where
},
}
}
}
(indexing_context.send_progress)(Progress::from_step(Step::WaitingForExtractors));