mirror of https://github.com/meilisearch/MeiliSearch
synced 2024-11-30 08:44:27 +01:00

Merge pull request #35 from meilisearch/better-update-progress

Better update progress

commit b4951c058b
@@ -54,13 +54,22 @@ $(window).on('load', function () {
         const content = $(`#${id} .updateStatus.content`);

         let html;
-        let { type, processed_number_of_documents, total_number_of_documents } = status.meta;
-        if (type === 'DocumentsAddition' && processed_number_of_documents && total_number_of_documents) {
-          let progress = Math.round(processed_number_of_documents / total_number_of_documents * 100);
+        let { type, step, total_steps, current, total } = status.meta;
+
+        if (type === 'DocumentsAddition') {
+          // If the total is null or undefined then the progress results is infinity.
+          let progress = Math.round(current / total * 100);
+          // We must divide the progress by the total number of indexing steps.
+          progress = progress / total_steps;
+          // And mark the previous steps as processed.
+          progress = progress + (step * 100 / total_steps);
+          // Generate the appropriate html bulma progress bar.
+          html = `<progress class="progress" title="${progress}%" value="${progress}" max="100"></progress>`;
         } else {
           html = `<progress class="progress" max="100"></progress>`;
         }

         content.html(html);
       }
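The computation above weights each indexing step equally: the within-step percentage is scaled down by total_steps, and every already-completed step contributes a flat 100 / total_steps. As a worked example, step 2 of 4 at 50/100 documents gives 50 / 4 + 2 * 100 / 4 = 62.5%. Below is a minimal, self-contained Rust sketch of essentially the same arithmetic (the function and variable names are mine, not part of the patch):

    /// Overall progress percentage across equally-weighted indexing steps.
    /// `total` may be unknown (None), in which case only finished steps count.
    fn overall_progress(step: usize, total_steps: usize, current: usize, total: Option<usize>) -> f64 {
        let within_step = match total {
            Some(total) if total > 0 => current as f64 / total as f64 * 100.0,
            _ => 0.0,
        };
        // Scale the within-step share down, then credit the finished steps.
        within_step / total_steps as f64 + step as f64 * 100.0 / total_steps as f64
    }

    fn main() {
        // Step 2 of 4, 50 of 100 documents seen: 12.5 + 50.0 = 62.5%.
        assert_eq!(overall_progress(2, 4, 50, Some(100)), 62.5);
    }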
@@ -26,6 +26,7 @@ use warp::filters::ws::Message;
 use warp::{Filter, http::Response};

 use milli::tokenizer::{simple_tokenizer, TokenType};
+use milli::update::UpdateIndexingStep::*;
 use milli::update::{UpdateBuilder, IndexDocumentsMethod, UpdateFormat};
 use milli::{obkv_to_json, Index, UpdateStore, SearchResult};

@@ -201,8 +202,10 @@ enum UpdateMeta {
 #[serde(tag = "type")]
 enum UpdateMetaProgress {
     DocumentsAddition {
-        processed_number_of_documents: usize,
-        total_number_of_documents: Option<usize>,
+        step: usize,
+        total_steps: usize,
+        current: usize,
+        total: Option<usize>,
     },
 }
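Because of #[serde(tag = "type")], each progress message carries its variant name in a "type" field alongside the new step fields, which is exactly the shape the dashboard script destructures from status.meta. A self-contained sketch of that serialization (the values are invented for illustration; assumes the serde and serde_json crates):

    use serde::Serialize;

    #[derive(Serialize)]
    #[serde(tag = "type")]
    enum UpdateMetaProgress {
        DocumentsAddition { step: usize, total_steps: usize, current: usize, total: Option<usize> },
    }

    fn main() {
        let meta = UpdateMetaProgress::DocumentsAddition { step: 2, total_steps: 4, current: 50, total: Some(100) };
        // Prints: {"type":"DocumentsAddition","step":2,"total_steps":4,"current":50,"total":100}
        println!("{}", serde_json::to_string(&meta).unwrap());
    }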
@@ -310,12 +313,20 @@ async fn main() -> anyhow::Result<()> {
                         Box::new(content) as Box<dyn io::Read>
                     };

-                    let result = builder.execute(reader, |count, total| {
+                    let result = builder.execute(reader, |indexing_step| {
+                        let (current, total) = match indexing_step {
+                            TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+                            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                            IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                            MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+                        };
                         let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
                             update_id,
                             meta: UpdateMetaProgress::DocumentsAddition {
-                                processed_number_of_documents: count,
-                                total_number_of_documents: Some(total),
+                                step: indexing_step.step(),
+                                total_steps: indexing_step.number_of_steps(),
+                                current,
+                                total,
                             }
                         });
                     });

@@ -356,12 +367,20 @@ async fn main() -> anyhow::Result<()> {
                         }
                     }

-                    let result = builder.execute(|count, total| {
+                    let result = builder.execute(|indexing_step| {
+                        let (current, total) = match indexing_step {
+                            TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
+                            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                            IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
+                            MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
+                        };
                         let _ = update_status_sender_cloned.send(UpdateStatus::Progressing {
                             update_id,
                             meta: UpdateMetaProgress::DocumentsAddition {
-                                processed_number_of_documents: count,
-                                total_number_of_documents: Some(total),
+                                step: indexing_step.step(),
+                                total_steps: indexing_step.number_of_steps(),
+                                current,
+                                total,
                             }
                         });
                     });
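Note that the match lowering an indexing_step to a (current, total) pair appears verbatim in both update paths above. For illustration only, a small helper that could factor it out (the helper name is mine; the enum is the one this PR adds to milli::update):

    use milli::update::UpdateIndexingStep::{self, *};

    /// How far the current step has progressed, plus the total when it is known
    /// (the first transform step streams documents and has no known total).
    fn step_progress(indexing_step: UpdateIndexingStep) -> (usize, Option<usize>) {
        match indexing_step {
            TransformFromUserIntoGenericFormat { documents_seen } => (documents_seen, None),
            ComputeIdsAndMergeDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
            IndexDocuments { documents_seen, total_documents } => (documents_seen, Some(total_documents)),
            MergeDataIntoFinalDatabase { databases_seen, total_databases } => (databases_seen, Some(total_databases)),
        }
    }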
@@ -15,6 +15,7 @@ use rayon::prelude::*;
 use rayon::ThreadPool;

 use crate::index::Index;
+use crate::update::UpdateIndexingStep;
 use self::store::Store;
 use self::merge_function::{
     main_merge, word_docids_merge, words_pairs_proximities_docids_merge,

@@ -249,13 +250,14 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
     pub fn execute<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<()>
     where
         R: io::Read,
-        F: Fn(usize, usize) + Sync,
+        F: Fn(UpdateIndexingStep) + Sync,
     {
         let before_transform = Instant::now();

         let transform = Transform {
             rtxn: &self.wtxn,
             index: self.index,
+            log_every_n: self.log_every_n,
             chunk_compression_type: self.chunk_compression_type,
             chunk_compression_level: self.chunk_compression_level,
             chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,

@@ -266,9 +268,9 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         };

         let output = match self.update_format {
-            UpdateFormat::Csv => transform.from_csv(reader)?,
-            UpdateFormat::Json => transform.from_json(reader)?,
-            UpdateFormat::JsonStream => transform.from_json_stream(reader)?,
+            UpdateFormat::Csv => transform.from_csv(reader, &progress_callback)?,
+            UpdateFormat::Json => transform.from_json(reader, &progress_callback)?,
+            UpdateFormat::JsonStream => transform.from_json_stream(reader, &progress_callback)?,
         };

         info!("Update transformed in {:.02?}", before_transform.elapsed());

@@ -278,7 +280,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {

     pub fn execute_raw<F>(self, output: TransformOutput, progress_callback: F) -> anyhow::Result<()>
     where
-        F: Fn(usize, usize) + Sync
+        F: Fn(UpdateIndexingStep) + Sync
     {
         let before_indexing = Instant::now();

@@ -460,6 +462,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
         documents_ids.union_with(&replaced_documents_ids);
         self.index.put_documents_ids(self.wtxn, &documents_ids)?;

+        let mut database_count = 0;
+        progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+            databases_seen: 0,
+            total_databases: 5,
+        });
+
         debug!("Writing the docid word positions into LMDB on disk...");
         merge_into_lmdb_database(
             self.wtxn,

@@ -469,6 +477,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             write_method
         )?;

+        database_count += 1;
+        progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+            databases_seen: database_count,
+            total_databases: 5,
+        });
+
         debug!("Writing the documents into LMDB on disk...");
         merge_into_lmdb_database(
             self.wtxn,

@@ -478,6 +492,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             write_method
         )?;

+        database_count += 1;
+        progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+            databases_seen: database_count,
+            total_databases: 5,
+        });
+
         debug!("Writing the words pairs proximities docids into LMDB on disk...");
         merge_into_lmdb_database(
             self.wtxn,

@@ -487,6 +507,12 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
             write_method,
         )?;

+        database_count += 1;
+        progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+            databases_seen: database_count,
+            total_databases: 5,
+        });
+
         for (db_type, result) in receiver {
             let content = result?;
             match db_type {

@@ -512,8 +538,16 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
                     )?;
                 },
             }
+
+            database_count += 1;
+            progress_callback(UpdateIndexingStep::MergeDataIntoFinalDatabase {
+                databases_seen: database_count,
+                total_databases: 5,
+            });
         }

+        debug_assert_eq!(database_count, 5);
+
         info!("Transform output indexed in {:.02?}", before_indexing.elapsed());

         Ok(())
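For callers, the visible change is only the callback shape: Fn(usize, usize) becomes Fn(UpdateIndexingStep). A hedged sketch of a caller that logs every step, in the style of the tests that follow (the wtxn/index/content setup is elided exactly as it is there):

    let mut builder = IndexDocuments::new(&mut wtxn, &index);
    builder.update_format(UpdateFormat::Csv);
    builder.execute(content, |indexing_step| {
        // step() is 0-based and number_of_steps() is currently the constant 4.
        eprintln!("step {}/{}: {:?}", indexing_step.step() + 1, indexing_step.number_of_steps(), indexing_step);
    }).unwrap();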
@@ -537,7 +571,7 @@ mod tests {
         let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 3 documents now.

@@ -551,7 +585,7 @@ mod tests {
         let content = &b"id,name\n1,updated kevin\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is **always** 3 documents.

@@ -565,7 +599,7 @@ mod tests {
         let content = &b"id,name\n1,updated second kevin\n2,updated kevina\n3,updated benoit\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is **always** 3 documents.

@@ -589,7 +623,7 @@ mod tests {
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
         builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is only 1 document now.

@@ -616,7 +650,7 @@ mod tests {
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
         builder.index_documents_method(IndexDocumentsMethod::UpdateDocuments);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is **always** 1 document.

@@ -652,7 +686,7 @@ mod tests {
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.disable_autogenerate_docids();
         builder.update_format(UpdateFormat::Csv);
-        assert!(builder.execute(content, |_, _| ()).is_err());
+        assert!(builder.execute(content, |_| ()).is_err());
         wtxn.commit().unwrap();

         // Check that there is no document.

@@ -679,7 +713,7 @@ mod tests {
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.disable_autogenerate_docids();
         builder.update_format(UpdateFormat::Json);
-        assert!(builder.execute(content, |_, _| ()).is_err());
+        assert!(builder.execute(content, |_| ()).is_err());
         wtxn.commit().unwrap();

         // Check that there is no document.

@@ -701,7 +735,7 @@ mod tests {
         let content = &b"name\nkevin\nkevina\nbenoit\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 3 documents now.

@@ -719,7 +753,7 @@ mod tests {
         let content = format!("id,name\n{},updated kevin", kevin_uuid);
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content.as_bytes(), |_, _| ()).unwrap();
+        builder.execute(content.as_bytes(), |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is **always** 3 documents.

@@ -754,7 +788,7 @@ mod tests {
         let content = &b"id,name\n1,kevin\n2,kevina\n3,benoit\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 3 documents now.

@@ -768,7 +802,7 @@ mod tests {
         let content = &b"name\nnew kevin"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 4 documents now.

@@ -790,7 +824,7 @@ mod tests {
         let content = &b"id,name\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is no documents.

@@ -816,7 +850,7 @@ mod tests {
         ]"#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Json);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 3 documents now.

@@ -838,7 +872,7 @@ mod tests {
         let content = &b"[]"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Json);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is no documents.

@@ -864,7 +898,7 @@ mod tests {
         "#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::JsonStream);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 3 documents now.

@@ -887,7 +921,7 @@ mod tests {
         let content = &b"id,name\nbrume bleue,kevin\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        assert!(builder.execute(content, |_, _| ()).is_err());
+        assert!(builder.execute(content, |_| ()).is_err());
         wtxn.commit().unwrap();

         // First we send 1 document with a valid id.

@@ -896,7 +930,7 @@ mod tests {
         let content = &b"id,name\n32,kevin\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 1 document now.

@@ -922,7 +956,7 @@ mod tests {
         ]"#[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Json);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that there is 1 documents now.
@@ -16,6 +16,7 @@ use tempfile::tempfile;

 use crate::heed_codec::{BoRoaringBitmapCodec, CboRoaringBitmapCodec};
 use crate::tokenizer::{simple_tokenizer, only_token};
+use crate::update::UpdateIndexingStep;
 use crate::{json_to_string, SmallVec32, Position, DocumentId};

 use super::{MergeFn, create_writer, create_sorter, writer_into_reader};

@@ -294,7 +295,7 @@ impl Store {
         log_every_n: Option<usize>,
         mut progress_callback: F,
     ) -> anyhow::Result<Readers>
-    where F: FnMut(usize, usize),
+    where F: FnMut(UpdateIndexingStep),
     {
         debug!("{:?}: Indexing in a Store...", thread_index);

@@ -311,7 +312,10 @@ impl Store {
             // This is a log routine that we do every `log_every_n` documents.
             if log_every_n.map_or(false, |len| count % len == 0) {
                 info!("We have seen {} documents so far ({:.02?}).", format_count(count), before.elapsed());
-                progress_callback(count, documents_count);
+                progress_callback(UpdateIndexingStep::IndexDocuments {
+                    documents_seen: count,
+                    total_documents: documents_count,
+                });
                 before = Instant::now();
             }

@@ -343,7 +347,10 @@ impl Store {
             count = count + 1;
         }

-        progress_callback(count, documents_count);
+        progress_callback(UpdateIndexingStep::IndexDocuments {
+            documents_seen: count,
+            total_documents: documents_count,
+        });

         let readers = self.finish()?;
         debug!("{:?}: Store created!", thread_index);
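Both call sites above follow the same throttling pattern: report progress every `log_every_n` documents inside the loop, then once unconditionally at the end so the final count is never lost. A self-contained sketch of that pattern (names are mine):

    fn index_all(total: usize, log_every_n: Option<usize>, mut progress_callback: impl FnMut(usize)) {
        for count in 0..total {
            // Throttled in-loop report, mirroring the `count % len == 0` check above.
            if log_every_n.map_or(false, |len| count % len == 0) {
                progress_callback(count);
            }
        }
        // Unconditional final report so consumers always see the last value.
        progress_callback(total);
    }

    fn main() {
        index_all(10, Some(4), |count| println!("seen {}", count)); // prints 0, 4, 8, 10
    }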
@@ -11,7 +11,7 @@ use roaring::RoaringBitmap;
 use serde_json::{Map, Value};

 use crate::{BEU32, MergeFn, Index, FieldsIdsMap};
-use crate::update::AvailableDocumentsIds;
+use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use super::merge_function::merge_two_obkvs;
 use super::{create_writer, create_sorter, IndexDocumentsMethod};

@@ -34,6 +34,7 @@ pub struct TransformOutput {
 pub struct Transform<'t, 'i> {
     pub rtxn: &'t heed::RoTxn<'i>,
     pub index: &'i Index,
+    pub log_every_n: Option<usize>,
     pub chunk_compression_type: CompressionType,
     pub chunk_compression_level: Option<u32>,
     pub chunk_fusing_shrink_size: Option<u64>,

@@ -44,15 +45,32 @@ pub struct Transform<'t, 'i> {
 }

 impl Transform<'_, '_> {
-    pub fn from_json<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
-        self.from_generic_json(reader, false)
+    pub fn from_json<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+    where
+        R: Read,
+        F: Fn(UpdateIndexingStep) + Sync,
+    {
+        self.from_generic_json(reader, false, progress_callback)
     }

-    pub fn from_json_stream<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
-        self.from_generic_json(reader, true)
+    pub fn from_json_stream<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+    where
+        R: Read,
+        F: Fn(UpdateIndexingStep) + Sync,
+    {
+        self.from_generic_json(reader, true, progress_callback)
     }

-    fn from_generic_json<R: Read>(self, reader: R, is_stream: bool) -> anyhow::Result<TransformOutput> {
+    fn from_generic_json<R, F>(
+        self,
+        reader: R,
+        is_stream: bool,
+        progress_callback: F,
+    ) -> anyhow::Result<TransformOutput>
+    where
+        R: Read,
+        F: Fn(UpdateIndexingStep) + Sync,
+    {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
         let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();
         let primary_key = self.index.primary_key(self.rtxn)?;

@@ -131,10 +149,17 @@ impl Transform<'_, '_> {
         let mut json_buffer = Vec::new();
         let mut obkv_buffer = Vec::new();
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
+        let mut documents_count = 0;

         for result in documents {
             let document = result?;

+            if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+                progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+                    documents_seen: documents_count,
+                });
+            }
+
             obkv_buffer.clear();
             let mut writer = obkv::KvWriter::new(&mut obkv_buffer);

@@ -186,14 +211,30 @@ impl Transform<'_, '_> {

             // We use the extracted/generated user id as the key for this document.
             sorter.insert(user_id.as_bytes(), &obkv_buffer)?;
+            documents_count += 1;
         }

+        progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+            documents_seen: documents_count,
+        });
+
         // Now that we have a valid sorter that contains the user id and the obkv we
         // give it to the last transforming function which returns the TransformOutput.
-        self.from_sorter(sorter, primary_key, fields_ids_map, users_ids_documents_ids)
+        self.from_sorter(
+            sorter,
+            primary_key,
+            fields_ids_map,
+            documents_count,
+            users_ids_documents_ids,
+            progress_callback,
+        )
     }

-    pub fn from_csv<R: Read>(self, reader: R) -> anyhow::Result<TransformOutput> {
+    pub fn from_csv<R, F>(self, reader: R, progress_callback: F) -> anyhow::Result<TransformOutput>
+    where
+        R: Read,
+        F: Fn(UpdateIndexingStep) + Sync,
+    {
         let mut fields_ids_map = self.index.fields_ids_map(self.rtxn)?;
         let users_ids_documents_ids = self.index.users_ids_documents_ids(self.rtxn).unwrap();

@@ -255,12 +296,19 @@ impl Transform<'_, '_> {
         let mut json_buffer = Vec::new();
         let mut obkv_buffer = Vec::new();
         let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH];
+        let mut documents_count = 0;

         let mut record = csv::StringRecord::new();
         while csv.read_record(&mut record)? {
             obkv_buffer.clear();
             let mut writer = obkv::KvWriter::new(&mut obkv_buffer);

+            if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+                progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+                    documents_seen: documents_count,
+                });
+            }
+
             // We extract the user id if we know where it is or generate an UUID V4 otherwise.
             let user_id = match user_id_pos {
                 Some(pos) => {

@@ -292,23 +340,39 @@ impl Transform<'_, '_> {

             // We use the extracted/generated user id as the key for this document.
             sorter.insert(user_id, &obkv_buffer)?;
+            documents_count += 1;
         }

+        progress_callback(UpdateIndexingStep::TransformFromUserIntoGenericFormat {
+            documents_seen: documents_count,
+        });
+
         // Now that we have a valid sorter that contains the user id and the obkv we
         // give it to the last transforming function which returns the TransformOutput.
-        self.from_sorter(sorter, primary_key_field_id, fields_ids_map, users_ids_documents_ids)
+        self.from_sorter(
+            sorter,
+            primary_key_field_id,
+            fields_ids_map,
+            documents_count,
+            users_ids_documents_ids,
+            progress_callback,
+        )
     }

     /// Generate the `TransformOutput` based on the given sorter that can be generated from any
     /// format like CSV, JSON or JSON stream. This sorter must contain a key that is the document
     /// id for the user side and the value must be an obkv where keys are valid fields ids.
-    fn from_sorter(
+    fn from_sorter<F>(
         self,
         sorter: grenad::Sorter<MergeFn>,
         primary_key: u8,
         fields_ids_map: FieldsIdsMap,
+        approximate_number_of_documents: usize,
         users_ids_documents_ids: fst::Map<Cow<'_, [u8]>>,
+        progress_callback: F,
     ) -> anyhow::Result<TransformOutput>
+    where
+        F: Fn(UpdateIndexingStep) + Sync,
     {
         let documents_ids = self.index.documents_ids(self.rtxn)?;
         let mut available_documents_ids = AvailableDocumentsIds::from_documents_ids(&documents_ids);

@@ -332,6 +396,13 @@ impl Transform<'_, '_> {
         let mut iter = sorter.into_iter()?;
         while let Some((user_id, update_obkv)) = iter.next()? {

+            if self.log_every_n.map_or(false, |len| documents_count % len == 0) {
+                progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
+                    documents_seen: documents_count,
+                    total_documents: approximate_number_of_documents,
+                });
+            }
+
             let (docid, obkv) = match users_ids_documents_ids.get(user_id) {
                 Some(docid) => {
                     // If we find the user id in the current users ids documents ids map

@@ -369,6 +440,11 @@ impl Transform<'_, '_> {
             documents_count += 1;
         }

+        progress_callback(UpdateIndexingStep::ComputeIdsAndMergeDocuments {
+            documents_seen: documents_count,
+            total_documents: documents_count,
+        });
+
         // We create a final writer to write the new documents in order from the sorter.
         let file = tempfile::tempfile()?;
         let mut writer = create_writer(self.chunk_compression_type, self.chunk_compression_level, file)?;
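Note the asymmetry in the transform's reports: TransformFromUserIntoGenericFormat carries documents_seen with no total (the input is streamed, so the batch size is only known once the pass finishes), while ComputeIdsAndMergeDocuments carries the approximate total collected during that first pass. A small self-contained sketch of rendering both shapes (the helper name is mine):

    fn render_progress(current: usize, total: Option<usize>) -> String {
        match total {
            Some(total) => format!("{}/{} ({}%)", current, total, current * 100 / total.max(1)),
            None => format!("{} documents seen", current),
        }
    }

    fn main() {
        assert_eq!(render_progress(5000, None), "5000 documents seen");
        assert_eq!(render_progress(50, Some(200)), "50/200 (25%)");
    }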
@@ -4,6 +4,7 @@ mod delete_documents;
 mod index_documents;
 mod settings;
 mod update_builder;
+mod update_step;
 mod update_store;

 pub use self::available_documents_ids::AvailableDocumentsIds;

@@ -12,4 +13,5 @@ pub use self::delete_documents::DeleteDocuments;
 pub use self::index_documents::{IndexDocuments, IndexDocumentsMethod, UpdateFormat};
 pub use self::settings::Settings;
 pub use self::update_builder::UpdateBuilder;
+pub use self::update_step::UpdateIndexingStep;
 pub use self::update_store::UpdateStore;
@@ -3,7 +3,7 @@ use grenad::CompressionType;
 use rayon::ThreadPool;

 use crate::update::index_documents::{Transform, IndexDocumentsMethod};
-use crate::update::{ClearDocuments, IndexDocuments};
+use crate::update::{ClearDocuments, IndexDocuments, UpdateIndexingStep};
 use crate::{Index, FieldsIdsMap};

 pub struct Settings<'a, 't, 'u, 'i> {

@@ -60,7 +60,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {

     pub fn execute<F>(self, progress_callback: F) -> anyhow::Result<()>
     where
-        F: Fn(usize, usize) + Sync
+        F: Fn(UpdateIndexingStep) + Sync
     {
         // Check that the searchable attributes have been specified.
         if let Some(value) = self.searchable_fields {

@@ -126,6 +126,7 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
         let transform = Transform {
             rtxn: &self.wtxn,
             index: self.index,
+            log_every_n: self.log_every_n,
             chunk_compression_type: self.chunk_compression_type,
             chunk_compression_level: self.chunk_compression_level,
             chunk_fusing_shrink_size: self.chunk_fusing_shrink_size,

@@ -231,14 +232,14 @@ mod tests {
         let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // We change the searchable fields to be the "name" field only.
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.set_searchable_fields(vec!["name".into()]);
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the searchable field is correctly set to "name" only.

@@ -260,7 +261,7 @@ mod tests {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.reset_searchable_fields();
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the searchable field have been reset and documents are found now.

@@ -286,7 +287,7 @@ mod tests {
         let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // In the same transaction we change the displayed fields to be only the "age".

@@ -295,7 +296,7 @@ mod tests {
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.set_displayed_fields(vec!["age".into()]);
         builder.set_searchable_fields(vec!["name".into()]);
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the displayed fields are correctly set to `None` (default value).

@@ -310,7 +311,7 @@ mod tests {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.reset_searchable_fields();
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the displayed fields always contains only the "age" field.

@@ -334,7 +335,7 @@ mod tests {
         let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the displayed fields are correctly set to `None` (default value).

@@ -356,12 +357,12 @@ mod tests {
         let content = &b"name,age\nkevin,23\nkevina,21\nbenoit,34\n"[..];
         let mut builder = IndexDocuments::new(&mut wtxn, &index);
         builder.update_format(UpdateFormat::Csv);
-        builder.execute(content, |_, _| ()).unwrap();
+        builder.execute(content, |_| ()).unwrap();

         // In the same transaction we change the displayed fields to be only the age.
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.set_displayed_fields(vec!["age".into()]);
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the displayed fields are correctly set to only the "age" field.

@@ -376,7 +377,7 @@ mod tests {
         let mut wtxn = index.write_txn().unwrap();
         let mut builder = Settings::new(&mut wtxn, &index);
         builder.reset_displayed_fields();
-        builder.execute(|_, _| ()).unwrap();
+        builder.execute(|_| ()).unwrap();
         wtxn.commit().unwrap();

         // Check that the displayed fields are correctly set to `None` (default value).
src/update/update_step.rs (new file, 36 lines)
@@ -0,0 +1,36 @@
+use UpdateIndexingStep::*;
+
+#[derive(Debug, Clone, Copy)]
+pub enum UpdateIndexingStep {
+    /// Transform from the original user given format (CSV, JSON, JSON lines)
+    /// into a generic format based on the obkv and grenad crates. This step also
+    /// deduplicate potential documents in this batch update by merging or replacing them.
+    TransformFromUserIntoGenericFormat { documents_seen: usize },
+
+    /// This step check the external document id, computes the internal ids and merge
+    /// the documents that are already present in the database.
+    ComputeIdsAndMergeDocuments { documents_seen: usize, total_documents: usize },
+
+    /// Extract the documents words using the tokenizer and compute the documents
+    /// facets. Stores those words, facets and documents ids on disk.
+    IndexDocuments { documents_seen: usize, total_documents: usize },
+
+    /// Merge the previously extracted data (words and facets) into the final LMDB database.
+    /// These extracted data are split into multiple databases.
+    MergeDataIntoFinalDatabase { databases_seen: usize, total_databases: usize },
+}
+
+impl UpdateIndexingStep {
+    pub const fn step(&self) -> usize {
+        match self {
+            TransformFromUserIntoGenericFormat { .. } => 0,
+            ComputeIdsAndMergeDocuments { .. } => 1,
+            IndexDocuments { .. } => 2,
+            MergeDataIntoFinalDatabase { .. } => 3,
+        }
+    }
+
+    pub const fn number_of_steps(&self) -> usize {
+        4
+    }
+}
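This enum is the whole public surface of the new progress API: the variants carry per-step counters, step() gives a stable 0-based index, and number_of_steps() is a constant so a UI can pre-size its bars. A short usage sketch (assumes the milli crate re-export added in src/update/mod.rs above; the helper name is mine):

    use milli::update::UpdateIndexingStep;

    fn describe(step: UpdateIndexingStep) -> String {
        format!("step {} of {}: {:?}", step.step() + 1, step.number_of_steps(), step)
    }

    fn main() {
        let step = UpdateIndexingStep::IndexDocuments { documents_seen: 50, total_documents: 200 };
        // Prints: step 3 of 4: IndexDocuments { documents_seen: 50, total_documents: 200 }
        println!("{}", describe(step));
    }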