Implement incremental computation of the documents database stats

This commit is contained in:
ManyTheFish 2025-02-17 16:36:33 +01:00 committed by Kerollmops
parent d9642ec916
commit 9f3663e768
No known key found for this signature in database
GPG key ID: F250A4C4E3AE5F5F
9 changed files with 116 additions and 53 deletions

View file

@ -307,6 +307,7 @@ where
let current_span = tracing::Span::current();
// Run extraction pipeline in parallel.
let mut modified_docids = RoaringBitmap::new();
pool.install(|| {
let settings_diff_cloned = settings_diff.clone();
rayon::spawn(move || {
@ -367,7 +368,7 @@ where
Err(status) => {
if let Some(typed_chunks) = chunk_accumulator.pop_longest() {
let (docids, is_merged_database) =
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks)?;
write_typed_chunk_into_index(self.wtxn, self.index, &settings_diff, typed_chunks, &mut modified_docids)?;
if !docids.is_empty() {
final_documents_ids |= docids;
let documents_seen_count = final_documents_ids.len();
@ -467,6 +468,10 @@ where
Ok(())
}).map_err(InternalError::from)??;
if !settings_diff.settings_update_only {
// Update the stats of the documents database when there is a document update.
self.index.update_documents_stats(self.wtxn, modified_docids)?;
}
// We write the field distribution into the main database
self.index.put_field_distribution(self.wtxn, &field_distribution)?;

View file

@ -129,6 +129,7 @@ pub(crate) fn write_typed_chunk_into_index(
index: &Index,
settings_diff: &InnerIndexSettingsDiff,
typed_chunks: Vec<TypedChunk>,
modified_docids: &mut RoaringBitmap,
) -> Result<(RoaringBitmap, bool)> {
let mut is_merged_database = false;
match typed_chunks[0] {
@ -214,6 +215,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Create,
});
docids.insert(docid);
modified_docids.insert(docid);
} else {
db.delete(wtxn, &docid)?;
operations.push(DocumentOperation {
@ -222,6 +224,7 @@ pub(crate) fn write_typed_chunk_into_index(
kind: DocumentOperationKind::Delete,
});
docids.remove(docid);
modified_docids.insert(docid);
}
}
let external_documents_docids = index.external_documents_ids();

View file

@ -711,15 +711,17 @@ impl DelAddRoaringBitmap {
DelAddRoaringBitmap { del, add }
}
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap) {
pub fn apply_to(&self, documents_ids: &mut RoaringBitmap, modified_docids: &mut RoaringBitmap) {
let DelAddRoaringBitmap { del, add } = self;
if let Some(del) = del {
*documents_ids -= del;
*modified_docids |= del;
}
if let Some(add) = add {
*documents_ids |= add;
*modified_docids |= add;
}
}
}

View file

@ -32,6 +32,7 @@ pub(super) fn extract_all<'pl, 'extractor, DC, MSP>(
field_distribution: &mut BTreeMap<String, u64>,
mut index_embeddings: Vec<IndexEmbeddingConfig>,
document_ids: &mut RoaringBitmap,
modified_docids: &mut RoaringBitmap,
) -> Result<(FacetFieldIdsDelta, Vec<IndexEmbeddingConfig>)>
where
DC: DocumentChanges<'pl>,
@ -70,7 +71,7 @@ where
// adding the delta should never cause a negative result, as we are removing fields that previously existed.
*current = current.saturating_add_signed(delta);
}
document_extractor_data.docids_delta.apply_to(document_ids);
document_extractor_data.docids_delta.apply_to(document_ids, modified_docids);
}
field_distribution.retain(|_, v| *v != 0);
@ -256,7 +257,7 @@ where
let Some(deladd) = data.remove(&config.name) else {
continue 'data;
};
deladd.apply_to(&mut config.user_provided);
deladd.apply_to(&mut config.user_provided, modified_docids);
}
}
}

View file

@ -130,6 +130,7 @@ where
let index_embeddings = index.embedding_configs(wtxn)?;
let mut field_distribution = index.field_distribution(wtxn)?;
let mut document_ids = index.documents_ids(wtxn)?;
let mut modified_docids = roaring::RoaringBitmap::new();
let congestion = thread::scope(|s| -> Result<ChannelCongestion> {
let indexer_span = tracing::Span::current();
@ -138,6 +139,7 @@ where
// prevent moving the field_distribution, document_ids and modified_docids in the inner closure...
let field_distribution = &mut field_distribution;
let document_ids = &mut document_ids;
let modified_docids = &mut modified_docids;
let extractor_handle =
Builder::new().name(S("indexer-extractors")).spawn_scoped(s, move || {
pool.install(move || {
@ -152,6 +154,7 @@ where
field_distribution,
index_embeddings,
document_ids,
modified_docids,
)
})
.unwrap()
@ -227,6 +230,7 @@ where
embedders,
field_distribution,
document_ids,
modified_docids,
)?;
Ok(congestion)

View file

@ -129,6 +129,7 @@ pub fn update_index(
embedders: EmbeddingConfigs,
field_distribution: std::collections::BTreeMap<String, u64>,
document_ids: roaring::RoaringBitmap,
modified_docids: roaring::RoaringBitmap,
) -> Result<()> {
index.put_fields_ids_map(wtxn, new_fields_ids_map.as_fields_ids_map())?;
if let Some(new_primary_key) = new_primary_key {
@ -140,6 +141,7 @@ pub fn update_index(
index.put_field_distribution(wtxn, &field_distribution)?;
index.put_documents_ids(wtxn, &document_ids)?;
index.set_updated_at(wtxn, &OffsetDateTime::now_utc())?;
index.update_documents_stats(wtxn, modified_docids)?;
Ok(())
}