Make sure the first document is associated with the document id 0

2025-07-04 20:37:15 +02:00 · 2020-08-29 10:56:40 +02:00 · 2020-08-29 10:56:40 +02:00 · 21aafd603c
commit 21aafd603c
parent 0a44ff86ab
1 changed files with 24 additions and 22 deletions
--- a/src/bin/indexer.rs
+++ b/src/bin/indexer.rs
@@ -434,18 +434,16 @@ fn index_csv(
    let mut document_id: usize = 0;
    let mut document = csv::StringRecord::new();
    while rdr.read_record(&mut document)? {
-        document_id = document_id + 1;

-        // We skip documents that must not be indexed by this thread
-        if document_id % num_threads != thread_index { continue }
-
-        let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
-        if document_id % (ONE_MILLION as u32) == 0 {
+        // We skip documents that must not be indexed by this thread.
+        if document_id % num_threads == thread_index {
+            if document_id % ONE_MILLION == 0 {
                debug!("We have seen {}m documents so far ({:.02?}).",
-                document_id / ONE_MILLION as u32, before.elapsed());
+                    document_id / ONE_MILLION, before.elapsed());
                before = Instant::now();
            }

+            let document_id = DocumentId::try_from(document_id).context("generated id is too big")?;
            for (attr, content) in document.iter().enumerate().take(MAX_ATTRIBUTES) {
                for (pos, word) in lexer::break_string(&content).enumerate().take(MAX_POSITION) {
                    let word = word.cow_to_lowercase();
@@ -462,6 +460,10 @@ fn index_csv(
            store.write_document(document_id, &document)?;
        }

+        // Compute the document id of the next document.
+        document_id = document_id + 1;
+    }
+
    let (reader, docs_reader) = store.finish()?;
    debug!("{:?}: Store created!", thread_index);
    Ok((reader, docs_reader))