mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-01-03 18:31:38 +01:00
Improve the display of the number of processed documents
This commit is contained in:
parent
59a127d022
commit
68f4af7d2e
7
Cargo.lock
generated
7
Cargo.lock
generated
@ -712,6 +712,12 @@ version = "1.3.4"
|
|||||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9"
|
checksum = "cd179ae861f0c2e53da70d892f5f3029f9594be0c41dc5269cd371691b1dc2f9"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "human_format"
|
||||||
|
version = "1.0.3"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "86cce260d758a9aa3d7c4b99d55c815a540f8a37514ba6046ab6be402a157cb0"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "humansize"
|
name = "humansize"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
@ -959,6 +965,7 @@ dependencies = [
|
|||||||
"fst",
|
"fst",
|
||||||
"fxhash",
|
"fxhash",
|
||||||
"heed",
|
"heed",
|
||||||
|
"human_format",
|
||||||
"itertools",
|
"itertools",
|
||||||
"jemallocator",
|
"jemallocator",
|
||||||
"levenshtein_automata",
|
"levenshtein_automata",
|
||||||
|
@ -14,6 +14,7 @@ flate2 = "1.0.17"
|
|||||||
fst = "0.4.3"
|
fst = "0.4.3"
|
||||||
fxhash = "0.2.1"
|
fxhash = "0.2.1"
|
||||||
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
heed = { version = "0.8.1", default-features = false, features = ["lmdb"] }
|
||||||
|
human_format = "1.0.3"
|
||||||
jemallocator = "0.3.2"
|
jemallocator = "0.3.2"
|
||||||
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
levenshtein_automata = { version = "0.2.0", features = ["fst_automaton"] }
|
||||||
linked-hash-map = "0.5.3"
|
linked-hash-map = "0.5.3"
|
||||||
|
@ -26,7 +26,6 @@ use milli::tokenizer::{simple_tokenizer, only_token};
|
|||||||
use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
|
use milli::{SmallVec32, Index, Position, DocumentId, BEU32};
|
||||||
|
|
||||||
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
const LMDB_MAX_KEY_LENGTH: usize = 511;
|
||||||
const ONE_MILLION: usize = 1_000_000;
|
|
||||||
|
|
||||||
const MAX_POSITION: usize = 1000;
|
const MAX_POSITION: usize = 1000;
|
||||||
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
const MAX_ATTRIBUTES: usize = u32::max_value() as usize / MAX_POSITION;
|
||||||
@ -81,6 +80,11 @@ struct Opt {
|
|||||||
|
|
||||||
#[derive(Debug, StructOpt)]
|
#[derive(Debug, StructOpt)]
|
||||||
struct IndexerOpt {
|
struct IndexerOpt {
|
||||||
|
/// The amount of documents to skip before printing
|
||||||
|
/// a log regarding the indexing advancement.
|
||||||
|
#[structopt(long, default_value = "1000000")] // 1m
|
||||||
|
log_every_n: usize,
|
||||||
|
|
||||||
/// MTBL max number of chunks in bytes.
|
/// MTBL max number of chunks in bytes.
|
||||||
#[structopt(long)]
|
#[structopt(long)]
|
||||||
max_nb_chunks: Option<usize>,
|
max_nb_chunks: Option<usize>,
|
||||||
@ -117,6 +121,10 @@ fn compression_type_from_str(name: &str) -> CompressionType {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn format_count(n: usize) -> String {
|
||||||
|
human_format::Formatter::new().with_decimals(1).with_separator("").format(n as f64)
|
||||||
|
}
|
||||||
|
|
||||||
fn lmdb_key_valid_size(key: &[u8]) -> bool {
|
fn lmdb_key_valid_size(key: &[u8]) -> bool {
|
||||||
!key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
|
!key.is_empty() && key.len() <= LMDB_MAX_KEY_LENGTH
|
||||||
}
|
}
|
||||||
@ -403,6 +411,7 @@ impl Store {
|
|||||||
mut rdr: csv::Reader<Box<dyn Read + Send>>,
|
mut rdr: csv::Reader<Box<dyn Read + Send>>,
|
||||||
thread_index: usize,
|
thread_index: usize,
|
||||||
num_threads: usize,
|
num_threads: usize,
|
||||||
|
log_every_n: usize,
|
||||||
) -> anyhow::Result<(Reader<Mmap>, Reader<Mmap>)>
|
) -> anyhow::Result<(Reader<Mmap>, Reader<Mmap>)>
|
||||||
{
|
{
|
||||||
debug!("{:?}: Indexing in a Store...", thread_index);
|
debug!("{:?}: Indexing in a Store...", thread_index);
|
||||||
@ -419,9 +428,10 @@ impl Store {
|
|||||||
while rdr.read_record(&mut document)? {
|
while rdr.read_record(&mut document)? {
|
||||||
// We skip documents that must not be indexed by this thread.
|
// We skip documents that must not be indexed by this thread.
|
||||||
if document_id % num_threads == thread_index {
|
if document_id % num_threads == thread_index {
|
||||||
if document_id % ONE_MILLION == 0 {
|
// This is a log routine that we do every `log_every_n` documents.
|
||||||
let count = document_id / ONE_MILLION;
|
if document_id % log_every_n == 0 {
|
||||||
info!("We have seen {}m documents so far ({:.02?}).", count, before.elapsed());
|
let count = format_count(document_id);
|
||||||
|
info!("We have seen {} documents so far ({:.02?}).", count, before.elapsed());
|
||||||
before = Instant::now();
|
before = Instant::now();
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -657,6 +667,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
let max_memory = opt.indexer.max_memory;
|
let max_memory = opt.indexer.max_memory;
|
||||||
let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
|
let chunk_compression_type = compression_type_from_str(&opt.indexer.chunk_compression_type);
|
||||||
let chunk_compression_level = opt.indexer.chunk_compression_level;
|
let chunk_compression_level = opt.indexer.chunk_compression_level;
|
||||||
|
let log_every_n = opt.indexer.log_every_n;
|
||||||
|
|
||||||
let readers = csv_readers(opt.csv_file, num_threads)?
|
let readers = csv_readers(opt.csv_file, num_threads)?
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
@ -669,7 +680,7 @@ fn main() -> anyhow::Result<()> {
|
|||||||
chunk_compression_type,
|
chunk_compression_type,
|
||||||
chunk_compression_level,
|
chunk_compression_level,
|
||||||
)?;
|
)?;
|
||||||
store.index_csv(rdr, i, num_threads)
|
store.index_csv(rdr, i, num_threads, log_every_n)
|
||||||
})
|
})
|
||||||
.collect::<Result<Vec<_>, _>>()?;
|
.collect::<Result<Vec<_>, _>>()?;
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user