5341: Embeddings stats r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #5321

## What does this PR do?
- Add embedding stats
- Force the dumpless upgrade to recompute stats
- Add tests


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2025-02-12 15:46:37 +00:00 committed by GitHub
commit 885710a07b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 626 additions and 72 deletions

View file

@ -410,8 +410,43 @@ impl ArroyWrapper {
/// Returns `self.database` with its data type remapped so that it is typed as
/// an `arroy::Database<BinaryQuantizedCosine>`.
///
/// `aggregate_stats` passes this handle to `self.readers` when
/// `self.quantized` is set.
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
self.database.remap_data_type()
}
/// Accumulates embedding statistics for this wrapper into `stats`.
///
/// The readers are obtained from `self.readers`, using the quantized database
/// when `self.quantized` is set and the angular database otherwise. For each
/// reader, its item ids are OR-ed into `stats.documents` and their count is
/// added to `stats.number_of_embeddings`. Iteration stops at the first reader
/// whose item ids are empty.
///
/// `stats` is only ever added to, never reset, so one `ArroyStats` can be
/// passed to several calls to build a combined total.
///
/// # Errors
///
/// Returns the first `arroy::Error` yielded by the reader iterator; updates
/// already applied to `stats` are kept.
pub fn aggregate_stats(
    &self,
    rtxn: &RoTxn,
    stats: &mut ArroyStats,
) -> Result<(), arroy::Error> {
    // Non-quantized case first, so the quantized path below reads as the tail.
    if !self.quantized {
        for opened in self.readers(rtxn, self.angular_db()) {
            let store = opened?;
            let item_ids = store.item_ids();
            if item_ids.is_empty() {
                break;
            }
            stats.number_of_embeddings += item_ids.len();
            stats.documents |= item_ids;
        }
        return Ok(());
    }

    for opened in self.readers(rtxn, self.quantized_db()) {
        let store = opened?;
        let item_ids = store.item_ids();
        if item_ids.is_empty() {
            break;
        }
        stats.number_of_embeddings += item_ids.len();
        stats.documents |= item_ids;
    }
    Ok(())
}
}
/// Accumulator filled in by `ArroyWrapper::aggregate_stats`.
///
/// The derived `Default` gives the starting value that `aggregate_stats`
/// then adds to.
#[derive(Debug, Default, Clone)]
pub struct ArroyStats {
/// Running sum of the number of item ids seen in each reader visited.
pub number_of_embeddings: u64,
/// Union of the item ids of every reader visited.
pub documents: RoaringBitmap,
}
/// One or multiple embeddings stored consecutively in a flat vector.
pub struct Embeddings<F> {
data: Vec<F>,