5341: Embeddings stats r=ManyTheFish a=ManyTheFish

# Pull Request

## Related issue
Fixes #5321

## What does this PR do?
- Add embedding stats
- force dumpless upgrade to recompute stats
- add tests


Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
meili-bors[bot] 2025-02-12 15:46:37 +00:00 committed by GitHub
commit 885710a07b
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
12 changed files with 626 additions and 72 deletions

View file

@ -22,7 +22,7 @@ use crate::heed_codec::version::VersionCodec;
use crate::heed_codec::{BEU16StrCodec, FstSetCodec, StrBEU16Codec, StrRefCodec};
use crate::order_by_map::OrderByMap;
use crate::proximity::ProximityPrecision;
use crate::vector::{ArroyWrapper, Embedding, EmbeddingConfig};
use crate::vector::{ArroyStats, ArroyWrapper, Embedding, EmbeddingConfig};
use crate::{
default_criteria, CboRoaringBitmapCodec, Criterion, DocumentId, ExternalDocumentsIds,
FacetDistribution, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldIdWordCountCodec,
@ -1731,6 +1731,18 @@ impl Index {
let compute_prefixes = self.prefix_search(rtxn)?.unwrap_or_default();
Ok(PrefixSettings { compute_prefixes, max_prefix_length: 4, prefix_count_threshold: 100 })
}
pub fn arroy_stats(&self, rtxn: &RoTxn<'_>) -> Result<ArroyStats> {
let mut stats = ArroyStats::default();
let embedding_configs = self.embedding_configs(rtxn)?;
for config in embedding_configs {
let embedder_id = self.embedder_category_id.get(rtxn, &config.name)?.unwrap();
let reader =
ArroyWrapper::new(self.vector_arroy, embedder_id, config.config.quantized());
reader.aggregate_stats(rtxn, &mut stats)?;
}
Ok(stats)
}
}
#[derive(Debug, Deserialize, Serialize)]

View file

@ -1,7 +1,9 @@
mod v1_12;
mod v1_13;
use heed::RwTxn;
use v1_12::{V1_12_3_To_Current, V1_12_To_V1_12_3};
use v1_12::{V1_12_3_To_V1_13_0, V1_12_To_V1_12_3};
use v1_13::V1_13_0_To_Current;
use crate::progress::{Progress, VariableNameStep};
use crate::{Index, InternalError, Result};
@ -26,11 +28,13 @@ pub fn upgrade(
progress: Progress,
) -> Result<bool> {
let from = index.get_version(wtxn)?.unwrap_or(db_version);
let upgrade_functions: &[&dyn UpgradeIndex] = &[&V1_12_To_V1_12_3 {}, &V1_12_3_To_Current()];
let upgrade_functions: &[&dyn UpgradeIndex] =
&[&V1_12_To_V1_12_3 {}, &V1_12_3_To_V1_13_0 {}, &V1_13_0_To_Current()];
let start = match from {
(1, 12, 0..=2) => 0,
(1, 12, 3..) => 1,
(1, 13, 0) => 2,
// We must handle the current version in the match because in case of a failure some index may have been upgraded but not other.
(1, 13, _) => return Ok(false),
(major, minor, patch) => {

View file

@ -1,7 +1,6 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::Progress;
use crate::{make_enum_progress, Index, Result};
@ -32,9 +31,9 @@ impl UpgradeIndex for V1_12_To_V1_12_3 {
}
#[allow(non_camel_case_types)]
pub(super) struct V1_12_3_To_Current();
pub(super) struct V1_12_3_To_V1_13_0 {}
impl UpgradeIndex for V1_12_3_To_Current {
impl UpgradeIndex for V1_12_3_To_V1_13_0 {
fn upgrade(
&self,
_wtxn: &mut RwTxn,
@ -42,14 +41,11 @@ impl UpgradeIndex for V1_12_3_To_Current {
_original: (u32, u32, u32),
_progress: Progress,
) -> Result<bool> {
Ok(false)
// recompute the indexes stats
Ok(true)
}
fn target_version(&self) -> (u32, u32, u32) {
(
VERSION_MAJOR.parse().unwrap(),
VERSION_MINOR.parse().unwrap(),
VERSION_PATCH.parse().unwrap(),
)
(1, 13, 0)
}
}

View file

@ -0,0 +1,29 @@
use heed::RwTxn;
use super::UpgradeIndex;
use crate::constants::{VERSION_MAJOR, VERSION_MINOR, VERSION_PATCH};
use crate::progress::Progress;
use crate::{Index, Result};
#[allow(non_camel_case_types)]
pub(super) struct V1_13_0_To_Current();
impl UpgradeIndex for V1_13_0_To_Current {
fn upgrade(
&self,
_wtxn: &mut RwTxn,
_index: &Index,
_original: (u32, u32, u32),
_progress: Progress,
) -> Result<bool> {
Ok(false)
}
fn target_version(&self) -> (u32, u32, u32) {
(
VERSION_MAJOR.parse().unwrap(),
VERSION_MINOR.parse().unwrap(),
VERSION_PATCH.parse().unwrap(),
)
}
}

View file

@ -410,8 +410,43 @@ impl ArroyWrapper {
fn quantized_db(&self) -> arroy::Database<BinaryQuantizedCosine> {
self.database.remap_data_type()
}
pub fn aggregate_stats(
&self,
rtxn: &RoTxn,
stats: &mut ArroyStats,
) -> Result<(), arroy::Error> {
if self.quantized {
for reader in self.readers(rtxn, self.quantized_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len();
}
} else {
for reader in self.readers(rtxn, self.angular_db()) {
let reader = reader?;
let documents = reader.item_ids();
if documents.is_empty() {
break;
}
stats.documents |= documents;
stats.number_of_embeddings += documents.len();
}
}
Ok(())
}
}
#[derive(Debug, Default, Clone)]
pub struct ArroyStats {
pub number_of_embeddings: u64,
pub documents: RoaringBitmap,
}
/// One or multiple embeddings stored consecutively in a flat vector.
pub struct Embeddings<F> {
data: Vec<F>,