From 68f4af7d2e7d0e5f44fccbda0872fa2f67967c4a Mon Sep 17 00:00:00 2001
From: Kerollmops
Date: Tue, 29 Sep 2020 15:10:08 +0200
Subject: [PATCH] Improve the display of the number of processed documents

---
 Cargo.lock         |  7 +++++++
 Cargo.toml         |  1 +
 src/bin/indexer.rs | 21 ++++++++++++++++-----
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index b7f33cd88..869eb47b4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -712,6 +712,12 @@ version = "1.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9"
 
+[[package]]
+name = "human_format"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0"
+
 [[package]]
 name = "humansize"
 version = "1.1.0"
@@ -959,6 +965,7 @@ dependencies = [
  "fst",
  "fxhash",
  "heed",
+ "human_format",
  "itertools",
  "jemallocator",
  "levenshtein_automata",
diff --git a/Cargo.toml b/Cargo.toml
index 60777ee34..fb6231ba3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ flate2 = "1.0.17"
 fst = "0.4.3"
 fxhash = "0.2.1"
 heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
+human_format = "1.0.3"
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.3"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index c8c15fd9d..688d9f40b 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -26,7 +26,6 @@ use milli::tokenizer::{simple_tokenizer, only_token};
 use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
-const ONE_MILLION: usize = 1_000_000;
 const MAX_POSITION: usize = 1000;
 const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
 
@@ -81,6 +80,11 @@ struct Opt {
 
 #[derive(Debug, StructOpt)]
 struct IndexerOpt {
+    /// The number of documents to process between two
+    /// log messages about the indexing progress.
+    #[structopt(long, default_value = "1000000")] // 1m
+    log_every_n: usize,
+
     /// MTBL max number of chunks in bytes.
     #[structopt(long)]
     max_nb_chunks: Option<usize>,
@@ -117,6 +121,10 @@ fn compression_type_from_str(name: &str) -> CompressionType {
     }
 }
 
+fn format_count(n: usize) -> String {
+    human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
+}
+
 fn lmdb_key_valid_size(key: &[u8]) -> bool {
     !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
 }
@@ -403,6 +411,7 @@ impl Store {
         mut rdr: csv::Reader<Box<dyn Read + Send>>,
         thread_index: usize,
         num_threads: usize,
+        log_every_n: usize,
     ) -> anyhow::Result<(Reader<Mmap>, Reader<Mmap>)> {
         debug!("{:?}: Indexing in a Store...", thread_index);
 
@@ -419,9 +428,10 @@ impl Store {
         while rdr.read_record(&mut document)? {
             // We skip documents that must not be indexed by this thread.
             if document_id % num_threads == thread_index {
-                if document_id % ONE_MILLION == 0 {
-                    let count = document_id / ONE_MILLION;
-                    info!("We have seen {}m documents so far ({:.02?}).", count, before.elapsed());
+                // Log the indexing progress every `log_every_n` documents.
+                if document_id % log_every_n == 0 {
+                    let count = format_count(document_id);
+                    info!("We have seen {} documents so far ({:.02?}).", count, before.elapsed());
                     before = Instant::now();
                 }
 
@@ -657,6 +667,7 @@ fn main() -> anyhow::Result<()> {
     let max_memory = opt.indexer.max_memory;
     let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
     let chunk_compression_level = opt.indexer.chunk_compression_level;
+    let log_every_n = opt.indexer.log_every_n;
 
     let readers = csv_readers(opt.csv_file, num_threads)?
         .into_par_iter()
@@ -669,7 +680,7 @@ fn main() -> anyhow::Result<()> {
                 chunk_compression_type,
                 chunk_compression_level,
             )?;
-            store.index_csv(rdr, i, num_threads)
+            store.index_csv(rdr, i, num_threads, log_every_n)
         })
         .collect::<Result<Vec<_>, _>>()?;
 
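
A quick sanity check of the new `format_count` helper, written as a
standalone sketch for reviewers. The expected strings assume that
`human_format` 1.0.3 scales by powers of 1000 and appends SI suffixes
("k", "M", ...), as the crate documents; the helper body is copied from
the patch above.

    fn format_count(n: usize) -> String {
        human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
    }

    fn main() {
        // With one decimal and an empty separator before the suffix,
        // these should print "300.0k", "1.0M" and "1.5M".
        for &n in &[300_000usize, 1_000_000, 1_500_000] {
            println!("{}", format_count(n));
        }
    }

The old code only logged at every million documents and printed a bare
count of millions ("1m", "2m", ...); the new `--log-every-n` flag
(default 1000000) makes the logging period configurable, and the count
is now rendered in a human-readable form at any period.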