Eagerly compute stats as fallback to the cache.

- Refactor all around to avoid spawning indexes more times than necessary
This commit is contained in:
Louis Dureuil 2023-02-28 15:24:31 +01:00 committed by Tamo
parent 3bbf760542
commit 076a3d371c
4 changed files with 83 additions and 21 deletions

View File

@ -847,8 +847,10 @@ impl IndexScheduler {
// this is a non-critical operation. If it fails, we should not fail
// the entire batch.
let res = || -> Result<()> {
let index_rtxn = index.read_txn()?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?;
let mut wtxn = self.env.write_txn()?;
self.index_mapper.compute_and_store_stats_of(&mut wtxn, &index_uid)?;
self.index_mapper.store_stats_of(&mut wtxn, &index_uid, stats)?;
wtxn.commit()?;
Ok(())
}();
@ -888,6 +890,10 @@ impl IndexScheduler {
)?;
index_wtxn.commit()?;
}
// drop rtxn before starting a new wtxn on the same db
rtxn.commit()?;
task.status = Status::Succeeded;
task.details = Some(Details::IndexInfo { primary_key });
@ -897,7 +903,9 @@ impl IndexScheduler {
// the entire batch.
let res = || -> Result<()> {
let mut wtxn = self.env.write_txn()?;
self.index_mapper.compute_and_store_stats_of(&mut wtxn, &index_uid)?;
let index_rtxn = index.read_txn()?;
let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)?;
self.index_mapper.store_stats_of(&mut wtxn, &index_uid, stats)?;
wtxn.commit()?;
Ok(())
}();

View File

@ -54,8 +54,11 @@ pub struct IndexMapper {
/// Map an index name with an index uuid currently available on disk.
pub(crate) index_mapping: Database<Str, UuidCodec>,
/// Map an index name with the cached stats associated to the index.
pub(crate) index_stats: Database<Str, SerdeJson<IndexStats>>,
/// Map an index UUID with the cached stats associated to the index.
///
/// Using an UUID forces to use the index_mapping table to recover the index behind a name, ensuring
/// consistency wrt index swapping.
pub(crate) index_stats: Database<UuidCodec, SerdeJson<IndexStats>>,
/// Path to the folder where the LMDB environments of each index are.
base_path: PathBuf,
@ -80,15 +83,39 @@ pub enum IndexStatus {
Available(Index),
}
/// The statistics that can be computed from an `Index` object.
#[derive(Serialize, Deserialize, Debug)]
pub struct IndexStats {
/// Number of documents in the index.
pub number_of_documents: u64,
/// Size of the index' DB, in bytes.
pub database_size: u64,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
/// Creation date of the index.
pub created_at: OffsetDateTime,
/// Date of the last update of the index.
pub updated_at: OffsetDateTime,
}
impl IndexStats {
/// Compute the stats of an index
///
/// # Parameters
///
/// - rtxn: a RO transaction for the index, obtained from `Index::read_txn()`.
pub fn new(index: &Index, rtxn: &RoTxn) -> Result<Self> {
let database_size = index.on_disk_size()?;
Ok(IndexStats {
number_of_documents: index.number_of_documents(rtxn)?,
database_size,
field_distribution: index.field_distribution(rtxn)?,
created_at: index.created_at(rtxn)?,
updated_at: index.updated_at(rtxn)?,
})
}
}
impl IndexMapper {
pub fn new(
env: &Env,
@ -149,12 +176,14 @@ impl IndexMapper {
/// Removes the index from the mapping table and the in-memory index map
/// but keeps the associated tasks.
pub fn delete_index(&self, mut wtxn: RwTxn, name: &str) -> Result<()> {
self.index_stats.delete(&mut wtxn, name)?;
let uuid = self
.index_mapping
.get(&wtxn, name)?
.ok_or_else(|| Error::IndexNotFound(name.to_string()))?;
// Not an error if the index had no stats in cache.
self.index_stats.delete(&mut wtxn, &uuid)?;
// Once we retrieved the UUID of the index we remove it from the mapping table.
assert!(self.index_mapping.delete(&mut wtxn, name)?);
@ -375,26 +404,42 @@ impl IndexMapper {
Ok(())
}
/// Return the stored stats of an index.
/// The stats of an index.
///
/// If available in the cache, they are directly returned.
/// Otherwise, the `Index` is opened to compute the stats on the fly (the result is not cached).
/// The stats for an index are cached after each `Index` update.
pub fn stats_of(&self, rtxn: &RoTxn, index_uid: &str) -> Result<IndexStats> {
self.index_stats
let uuid = self
.index_mapping
.get(rtxn, index_uid)?
.ok_or_else(|| Error::IndexNotFound(index_uid.to_string()))
.ok_or_else(|| Error::IndexNotFound(index_uid.to_string()))?;
match self.index_stats.get(rtxn, &uuid)? {
Some(stats) => Ok(stats),
None => {
let index = self.index(rtxn, index_uid)?;
let index_rtxn = index.read_txn()?;
IndexStats::new(&index, &index_rtxn)
}
}
}
/// Return the stats of an index and write it in the index-mapper database.
pub fn compute_and_store_stats_of(&self, wtxn: &mut RwTxn, index_uid: &str) -> Result<()> {
let index = self.index(wtxn, index_uid)?;
let database_size = index.on_disk_size()?;
let rtxn = index.read_txn()?;
let stats = IndexStats {
number_of_documents: index.number_of_documents(&rtxn)?,
database_size,
field_distribution: index.field_distribution(&rtxn)?,
created_at: index.created_at(&rtxn)?,
updated_at: index.updated_at(&rtxn)?,
};
self.index_stats.put(wtxn, index_uid, &stats)?;
/// Stores the new stats for an index.
///
/// Expected usage is to compute the stats the index using `IndexStats::new`, the pass it to this function.
pub fn store_stats_of(
&self,
wtxn: &mut RwTxn,
index_uid: &str,
stats: IndexStats,
) -> Result<()> {
let uuid = self
.index_mapping
.get(wtxn, index_uid)?
.ok_or_else(|| Error::IndexNotFound(index_uid.to_string()))?;
self.index_stats.put(wtxn, &uuid, &stats)?;
Ok(())
}

View File

@ -1245,9 +1245,14 @@ struct IndexBudget {
task_db_size: usize,
}
/// The statistics that can be computed from an `Index` object and the scheduler.
///
/// Compared with `index_mapper::IndexStats`, it adds the scheduling status.
#[derive(Debug)]
pub struct IndexStats {
/// Whether this index is currently performing indexation, according to the scheduler.
pub is_indexing: bool,
/// Internal stats computed from the index.
pub inner_stats: index_mapper::IndexStats,
}

View File

@ -220,11 +220,15 @@ pub async fn delete_index(
Ok(HttpResponse::Accepted().json(task))
}
/// Stats of an `Index`, as known to the `stats` route.
#[derive(Serialize, Debug)]
#[serde(rename_all = "camelCase")]
pub struct IndexStats {
/// Number of documents in the index
pub number_of_documents: u64,
/// Whether the index is currently performing indexation, according to the scheduler.
pub is_indexing: bool,
/// Association of every field name with the number of times it occurs in the documents.
pub field_distribution: FieldDistribution,
}