Mirror of https://github.com/meilisearch/MeiliSearch, synced 2025-01-09 13:04:30 +01:00
Merge #5166

5166: fix list indexes r=dureuill a=irevoire

# Pull Request

### Smol benchmark on a Meilisearch instance with 1009 indexes

**Before** this PR, on my computer, it took 5.5s to call the `GET /indexes` route on a cold start where all the indexes were closed.
**After** this PR, it takes 0.009s to call the route for the first 20 indexes and 0.176s for the last 20 (on main, retrieving the first or the last indexes makes no difference to performance).
If my computations are right, that's between 3125% and 61111.1% faster on this test 😂

## Related issue
Fixes https://github.com/meilisearch/meilisearch/issues/4694

## What does this PR do?
- Add the primary key to the cache we already have in the index-mapper
- Provide a new route to retrieve the paginated indexes straight from the cache without opening them
- Fix a bug where the cache was not computed when loading a dump, which forced us to open the indexes to compute their stats on the fly

## Is it breaking?
Since the field I added is an `Option`, I think we should consider it non-breaking and let it update itself automatically on the next operation on this index. I also tested running my patch over a DB generated on release-v1.12.0, and it works. Importing a dump also works.

Co-authored-by: Tamo <tamo@meilisearch.com>
This commit is contained in: commit baeefa4817
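Conceptually, the speed-up comes from answering `GET /indexes` out of the per-index stats cache that the index-mapper already persists, instead of opening every index on disk. Below is a minimal sketch of that idea, using a `BTreeMap` as a stand-in for the LMDB-backed stats database; `CachedStats` and `list_indexes_from_cache` are illustrative names, not Meilisearch's actual API.

```rust
use std::collections::BTreeMap;

// Illustrative stand-in for the stats the index-mapper caches per index.
#[derive(Clone, Debug)]
struct CachedStats {
    primary_key: Option<String>, // newly cached by this PR
    created_at: String,
    updated_at: String,
}

// List a page of indexes using only the cache: no index file is ever opened.
fn list_indexes_from_cache(
    cache: &BTreeMap<String, CachedStats>,
    offset: usize,
    limit: usize,
) -> (usize, Vec<(String, CachedStats)>) {
    let total = cache.len();
    let page = cache
        .iter()
        .skip(offset)
        .take(limit)
        .map(|(uid, stats)| (uid.clone(), stats.clone()))
        .collect();
    (total, page)
}

fn main() {
    let mut cache = BTreeMap::new();
    for i in 0..1009 {
        cache.insert(
            format!("index-{i:04}"),
            CachedStats {
                primary_key: Some("id".into()),
                created_at: "2025-01-01T00:00:00Z".into(),
                updated_at: "2025-01-02T00:00:00Z".into(),
            },
        );
    }
    let (total, page) = list_indexes_from_cache(&cache, 0, 20);
    println!("total: {total}, page size: {}", page.len());
}
```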
@@ -106,6 +106,8 @@ pub struct IndexStats {
     /// As the DB backend does not return to the disk the pages that are not currently used by the DB,
     /// this value is typically smaller than `database_size`.
     pub used_database_size: u64,
+    /// The primary key of the index
+    pub primary_key: Option<String>,
     /// Association of every field name with the number of times it occurs in the documents.
     pub field_distribution: FieldDistribution,
     /// Creation date of the index.
@@ -127,6 +129,7 @@ impl IndexStats {
             number_of_documents: index.number_of_documents(rtxn)?,
             database_size: index.on_disk_size()?,
             used_database_size: index.used_size()?,
+            primary_key: index.primary_key(rtxn)?.map(|s| s.to_string()),
             field_distribution: index.field_distribution(rtxn)?,
             created_at: index.created_at(rtxn)?,
             updated_at: index.updated_at(rtxn)?,
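The non-breaking claim in the PR description relies on older cached entries, written without the new field, still deserializing with `primary_key` left empty until the next operation on the index refreshes the stats. Here is a minimal sketch of that idea, assuming a serde/serde_json representation; the actual codec used by the index-mapper may differ.

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct IndexStatsV2 {
    number_of_documents: u64,
    // `#[serde(default)]` makes the backward-compatible default explicit:
    // entries written before the upgrade have no `primary_key` key and
    // deserialize to `None` until the cached stats are rewritten.
    #[serde(default)]
    primary_key: Option<String>,
}

fn main() -> Result<(), serde_json::Error> {
    // An entry serialized by the previous version has no `primary_key` key.
    let old_entry = r#"{ "number_of_documents": 42 }"#;
    let stats: IndexStatsV2 = serde_json::from_str(old_entry)?;
    assert_eq!(stats.primary_key, None);
    println!("{stats:?}");
    Ok(())
}
```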
@@ -30,7 +30,7 @@ mod processing;
 mod utils;
 pub mod uuid_codec;
 
-pub type Result<T> = std::result::Result<T, Error>;
+pub type Result<T, E = Error> = std::result::Result<T, E>;
 pub type TaskId = u32;
 
 use std::collections::{BTreeMap, HashMap};
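The `Result` alias change above gives the error parameter a default, so existing `Result<T>` call sites keep compiling while new code can name a different error type through the same alias. A small self-contained sketch of the pattern (the error types here are illustrative):

```rust
// Illustrative error types.
#[derive(Debug)]
struct Error;
#[derive(Debug)]
struct OtherError;

// Same shape as the alias introduced in the diff: the error type defaults to
// `Error` but can be overridden per call site.
pub type Result<T, E = Error> = std::result::Result<T, E>;

fn default_error() -> Result<u32> {
    // `Result<u32>` still means `Result<u32, Error>`, so existing signatures compile unchanged.
    Err(Error)
}

fn other_error() -> Result<u32, OtherError> {
    // The same alias now also accepts an explicit error type.
    Err(OtherError)
}

fn main() {
    println!("{:?} / {:?}", default_error(), other_error());
}
```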
@@ -1121,6 +1121,49 @@ impl IndexScheduler {
         Ok(batches)
     }
 
+    /// Returns the total number of indexes available for the specified filter.
+    /// And a `Vec` of the index_uid + its stats
+    pub fn get_paginated_indexes_stats(
+        &self,
+        filters: &meilisearch_auth::AuthFilter,
+        from: usize,
+        limit: usize,
+    ) -> Result<(usize, Vec<(String, index_mapper::IndexStats)>)> {
+        let rtxn = self.read_txn()?;
+
+        let mut total = 0;
+        let mut iter = self
+            .index_mapper
+            .index_mapping
+            .iter(&rtxn)?
+            // in case of an error we want to keep the value to return it
+            .filter(|ret| {
+                ret.as_ref().map_or(true, |(name, _uuid)| filters.is_index_authorized(name))
+            })
+            .inspect(|_| total += 1)
+            .skip(from);
+        let ret = iter
+            .by_ref()
+            .take(limit)
+            .map(|ret| ret.map_err(Error::from))
+            .map(|ret| {
+                ret.and_then(|(name, uuid)| {
+                    self.index_mapper.index_stats.get(&rtxn, &uuid).map_err(Error::from).and_then(
+                        |stat| {
+                            stat.map(|stat| (name.to_string(), stat))
+                                .ok_or(Error::CorruptedTaskQueue)
+                        },
+                    )
+                })
+            })
+            .collect::<Result<Vec<(String, index_mapper::IndexStats)>>>();
+
+        // We must iterate on the rest of the indexes to compute the total
+        iter.for_each(drop);
+
+        ret.map(|ret| (total, ret))
+    }
+
     /// The returned structure contains:
     /// 1. The name of the property being observed can be `statuses`, `types`, or `indexes`.
     /// 2. The name of the specific data related to the property can be `enqueued` for the `statuses`, `settingsUpdate` for the `types`, or the name of the index for the `indexes`, for example.
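The notable part of `get_paginated_indexes_stats` is that a single pass over the cached mapping yields both the requested page and the total: `inspect` counts every entry that survives the authorization filter, `skip`/`take` carve out the page, and draining what remains of the iterator finishes the count. A standalone sketch of that pattern (without the authorization filter) over plain data:

```rust
fn paginate_and_count<T: Clone>(items: &[T], from: usize, limit: usize) -> (usize, Vec<T>) {
    let mut total = 0;
    // Count every item that flows through the iterator, even outside the page.
    let mut iter = items.iter().inspect(|_| total += 1).skip(from);
    let page: Vec<T> = iter.by_ref().take(limit).cloned().collect();
    // Exhaust the rest of the iterator so `total` covers all items, not just
    // the ones consumed by `take(limit)`.
    iter.for_each(drop);
    (total, page)
}

fn main() {
    let names: Vec<String> = (0..100).map(|i| format!("index-{i:03}")).collect();
    let (total, page) = paginate_and_count(&names, 40, 20);
    assert_eq!(total, 100);
    assert_eq!(page.len(), 20);
    println!("total = {total}, page = {:?}..{:?}", page.first(), page.last());
}
```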
@@ -1497,6 +1540,19 @@ impl IndexScheduler {
         Ok(index)
     }
 
+    pub fn refresh_index_stats(&self, name: &str) -> Result<()> {
+        let mut mapper_wtxn = self.env.write_txn()?;
+        let index = self.index_mapper.index(&mapper_wtxn, name)?;
+        let index_rtxn = index.read_txn()?;
+
+        let stats = crate::index_mapper::IndexStats::new(&index, &index_rtxn)
+            .map_err(|e| Error::from_milli(e, Some(name.to_string())))?;
+
+        self.index_mapper.store_stats_of(&mut mapper_wtxn, name, &stats)?;
+        mapper_wtxn.commit()?;
+        Ok(())
+    }
+
     /// Create a file and register it in the index scheduler.
     ///
     /// The returned file and uuid can be used to associate
@@ -435,7 +435,7 @@ fn import_dump(
         let reader = DocumentsBatchReader::from_reader(reader)?;
 
         let embedder_configs = index.embedding_configs(&wtxn)?;
-        let embedders = index_scheduler.embedders(uid, embedder_configs)?;
+        let embedders = index_scheduler.embedders(uid.to_string(), embedder_configs)?;
 
         let builder = milli::update::IndexDocuments::new(
             &mut wtxn,
@@ -457,6 +457,8 @@ fn import_dump(
         builder.execute()?;
         wtxn.commit()?;
         tracing::info!("All documents successfully imported.");
+
+        index_scheduler.refresh_index_stats(&uid)?;
     }
 
     let mut index_scheduler_dump = index_scheduler.register_dumped_task()?;
@@ -5,7 +5,7 @@ use actix_web::web::Data;
 use actix_web::{web, HttpRequest, HttpResponse};
 use deserr::actix_web::{AwebJson, AwebQueryParameter};
 use deserr::{DeserializeError, Deserr, ValuePointerRef};
-use index_scheduler::{Error, IndexScheduler};
+use index_scheduler::IndexScheduler;
 use meilisearch_types::deserr::query_params::Param;
 use meilisearch_types::deserr::{immutable_field_error, DeserrJsonError, DeserrQueryParamError};
 use meilisearch_types::error::deserr_codes::*;
@@ -104,19 +104,18 @@ pub async fn list_indexes(
 ) -> Result<HttpResponse, ResponseError> {
     debug!(parameters = ?paginate, "List indexes");
     let filters = index_scheduler.filters();
-    let indexes: Vec<Option<IndexView>> =
-        index_scheduler.try_for_each_index(|uid, index| -> Result<Option<IndexView>, _> {
-            if !filters.is_index_authorized(uid) {
-                return Ok(None);
-            }
-            Ok(Some(
-                IndexView::new(uid.to_string(), index)
-                    .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?,
-            ))
-        })?;
-    // Won't cause to open all indexes because IndexView doesn't keep the `Index` opened.
-    let indexes: Vec<IndexView> = indexes.into_iter().flatten().collect();
-    let ret = paginate.as_pagination().auto_paginate_sized(indexes.into_iter());
+    let (total, indexes) =
+        index_scheduler.get_paginated_indexes_stats(filters, *paginate.offset, *paginate.limit)?;
+    let indexes = indexes
+        .into_iter()
+        .map(|(name, stats)| IndexView {
+            uid: name,
+            created_at: stats.created_at,
+            updated_at: stats.updated_at,
+            primary_key: stats.primary_key,
+        })
+        .collect::<Vec<_>>();
+    let ret = paginate.as_pagination().format_with(total, indexes);
 
     debug!(returns = ?ret, "List indexes");
     Ok(HttpResponse::Ok().json(ret))
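On the route side, the handler no longer materializes an `IndexView` for every index before slicing; the page and the total come straight from the scheduler and are wrapped into the paginated payload. A rough sketch of the response envelope this produces, assuming the `results`/`offset`/`limit`/`total` shape that `GET /indexes` returns (the `Paginated` struct name and the trimmed-down `IndexView` are illustrative):

```rust
use serde::Serialize;

// Illustrative stand-in for the route's view type.
#[derive(Serialize)]
struct IndexView {
    uid: String,
    primary_key: Option<String>,
}

// Paginated envelope: the page of results plus the offset/limit that produced
// it and the total number of matching indexes.
#[derive(Serialize)]
struct Paginated<T> {
    results: Vec<T>,
    offset: usize,
    limit: usize,
    total: usize,
}

fn main() -> Result<(), serde_json::Error> {
    let page = vec![IndexView { uid: "movies".into(), primary_key: Some("id".into()) }];
    let body = Paginated { results: page, offset: 0, limit: 20, total: 1009 };
    println!("{}", serde_json::to_string_pretty(&body)?);
    Ok(())
}
```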