diff --git a/Cargo.lock b/Cargo.lock
index b7f33cd88..869eb47b4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -712,6 +712,12 @@ version = "1.3.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9"
 
+[[package]]
+name = "human_format"
+version = "1.0.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0"
+
 [[package]]
 name = "humansize"
 version = "1.1.0"
@@ -959,6 +965,7 @@ dependencies = [
  "fst",
  "fxhash",
  "heed",
+ "human_format",
  "itertools",
  "jemallocator",
  "levenshtein_automata",
diff --git a/Cargo.toml b/Cargo.toml
index 60777ee34..fb6231ba3 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -14,6 +14,7 @@ flate2 = "1.0.17"
 fst = "0.4.3"
 fxhash = "0.2.1"
 heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
+human_format = "1.0.3"
 jemallocator = "0.3.2"
 levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
 linked-hash-map = "0.5.3"
diff --git a/src/bin/indexer.rs b/src/bin/indexer.rs
index c8c15fd9d..688d9f40b 100644
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -26,7 +26,6 @@ use milli::tokenizer::{simple_tokenizer, only_token};
 use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
 
 const LMDB_MAX_KEY_LENGTH: usize = 511;
-const ONE_MILLION: usize = 1_000_000;
 const MAX_POSITION: usize = 1000;
 const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
 
@@ -81,6 +80,11 @@ struct Opt {
 
 #[derive(Debug, StructOpt)]
 struct IndexerOpt {
+    /// The amount of documents to skip before printing
+    /// a log regarding the indexing advancement.
+    #[structopt(long, default_value = "1000000")] // 1m
+    log_every_n: usize,
+
     /// MTBL max number of chunks in bytes.
     #[structopt(long)]
     max_nb_chunks: Option<usize>,
@@ -117,6 +121,10 @@ fn compression_type_from_str(name: &str) -> CompressionType {
     }
 }
 
+fn format_count(n: usize) -> String {
+    human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
+}
+
 fn lmdb_key_valid_size(key: &[u8]) -> bool {
     !key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
 }
@@ -403,6 +411,7 @@ impl Store {
         mut rdr: csv::Reader<Box<dyn Read + Send>>,
         thread_index: usize,
         num_threads: usize,
+        log_every_n: usize,
     ) -> anyhow::Result<(Reader<Mmap>, Reader<Mmap>)>
     {
         debug!("{:?}: Indexing in a Store...", thread_index);
@@ -419,9 +428,10 @@
         while rdr.read_record(&mut document)? {
             // We skip documents that must not be indexed by this thread.
             if document_id % num_threads == thread_index {
-                if document_id % ONE_MILLION == 0 {
-                    let count = document_id / ONE_MILLION;
-                    info!("We have seen {}m documents so far ({:.02?}).", count, before.elapsed());
+                // This is a log routine that we do every `log_every_n` documents.
+                if document_id % log_every_n == 0 {
+                    let count = format_count(document_id);
+                    info!("We have seen {} documents so far ({:.02?}).", count, before.elapsed());
                     before = Instant::now();
                 }
 
@@ -657,6 +667,7 @@ fn main() -> anyhow::Result<()> {
     let max_memory = opt.indexer.max_memory;
    let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
     let chunk_compression_level = opt.indexer.chunk_compression_level;
+    let log_every_n = opt.indexer.log_every_n;
 
     let readers = csv_readers(opt.csv_file, num_threads)?
         .into_par_iter()
@@ -669,7 +680,7 @@ fn main() -> anyhow::Result<()> {
                 chunk_compression_type,
                 chunk_compression_level,
             )?;
-            store.index_csv(rdr, i, num_threads)
+            store.index_csv(rdr, i, num_threads, log_every_n)
         })
         .collect::<Result<Vec<_>, _>>()?;
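
For reference, a minimal standalone sketch of how the new `format_count` helper behaves. The chained `Formatter` calls are taken verbatim from the patch; the expected outputs in the comments are assumptions based on human_format's default SI scale suffixes ("", "k", "M", ...), not output captured from the indexer:

```rust
// Standalone sketch of the `format_count` helper introduced above.
// Requires `human_format = "1.0.3"` in Cargo.toml.
fn format_count(n: usize) -> String {
    human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
}

fn main() {
    // With the default `--log-every-n 1000000` (the structopt flag derived
    // from `log_every_n`), the indexer logs something like
    // "We have seen 1.0M documents so far (...)" instead of the
    // previous hard-coded millions counter.
    println!("{}", format_count(1_000_000)); // expected: "1.0M"
    println!("{}", format_count(2_500_000)); // expected: "2.5M"
    println!("{}", format_count(1_000));     // expected: "1.0k"
}
```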