Mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-06-20 21:48:29 +02:00)

Parallelize document upload

Commit: b21c983b0a
Parent: 04ee7b0863
@@ -7,9 +7,9 @@ use backoff::ExponentialBackoff;
 use meilisearch_types::index_uid_pattern::IndexUidPattern;
 use meilisearch_types::milli::constants::RESERVED_VECTORS_FIELD_NAME;
 use meilisearch_types::milli::progress::{Progress, VariableNameStep};
-use meilisearch_types::milli::update::Setting;
+use meilisearch_types::milli::update::{request_threads, Setting};
 use meilisearch_types::milli::vector::parsed_vectors::{ExplicitVectors, VectorOrArrayOfVectors};
-use meilisearch_types::milli::{obkv_to_json, Filter};
+use meilisearch_types::milli::{self, obkv_to_json, Filter, InternalError};
 use meilisearch_types::settings::{self, SecretPolicy};
 use meilisearch_types::tasks::ExportIndexSettings;
 use serde::Deserialize;
@@ -112,6 +112,10 @@ impl IndexScheduler {
                 .embedding_configs(&index_rtxn)
                 .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;

+            // We don't need to keep this one alive as we will
+            // spawn many threads to process the documents
+            drop(index_rtxn);
+
             let total_documents = universe.len() as u32;
             let (step, progress_step) = AtomicDocumentStep::new(total_documents);
             progress.update_progress(progress_step);
@@ -119,9 +123,19 @@ impl IndexScheduler {
             let limit = 50 * 1024 * 1024; // 50 MiB
             let documents_url = format!("{base_url}/indexes/{uid}/documents");

+            request_threads()
+                .broadcast(|ctx| {
+                    let index_rtxn = index
+                        .read_txn()
+                        .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?;
+
                     let mut buffer = Vec::new();
                     let mut tmp_buffer = Vec::new();
-                    for (i, docid) in universe.into_iter().enumerate() {
+                    for (i, docid) in universe.iter().enumerate() {
+                        if i % ctx.num_threads() != ctx.index() {
+                            continue;
+                        }
+
                         let document = index
                             .document(&index_rtxn, docid)
                             .map_err(|e| Error::from_milli(e, Some(uid.to_string())))?;
@@ -145,11 +159,14 @@ impl IndexScheduler {

                         let serde_json::Value::Object(vectors) = vectors else {
                             return Err(Error::from_milli(
-                                meilisearch_types::milli::Error::UserError(
-                                    meilisearch_types::milli::UserError::InvalidVectorsMapType {
+                                milli::Error::UserError(
+                                    milli::UserError::InvalidVectorsMapType {
                                         document_id: {
                                             if let Ok(Some(Ok(index))) = index
-                                                .external_id_of(&index_rtxn, std::iter::once(docid))
+                                                .external_id_of(
+                                                    &index_rtxn,
+                                                    std::iter::once(docid),
+                                                )
                                                 .map(|it| it.into_iter().next())
                                             {
                                                 index
@@ -171,18 +188,21 @@ impl IndexScheduler {
                                 .is_some_and(|conf| conf.user_provided.contains(docid));

                             let embeddings = ExplicitVectors {
-                                embeddings: Some(VectorOrArrayOfVectors::from_array_of_vectors(
-                                    embeddings,
-                                )),
+                                embeddings: Some(
+                                    VectorOrArrayOfVectors::from_array_of_vectors(embeddings),
+                                ),
                                 regenerate: !user_provided,
                             };
-                            vectors.insert(embedder_name, serde_json::to_value(embeddings).unwrap());
+                            vectors.insert(
+                                embedder_name,
+                                serde_json::to_value(embeddings).unwrap(),
+                            );
                         }
                     }

                     tmp_buffer.clear();
                     serde_json::to_writer(&mut tmp_buffer, &document)
-                        .map_err(meilisearch_types::milli::InternalError::from)
+                        .map_err(milli::InternalError::from)
                         .map_err(|e| Error::from_milli(e.into(), Some(uid.to_string())))?;

                     if buffer.len() + tmp_buffer.len() > limit {
@@ -190,7 +210,8 @@ impl IndexScheduler {
                         let mut request = agent.post(&documents_url);
                         request = request.set("Content-Type", "application/x-ndjson");
                         if let Some(api_key) = api_key {
-                            request = request.set("Authorization", &(format!("Bearer {api_key}")));
+                            request = request
+                                .set("Authorization", &(format!("Bearer {api_key}")));
                         }
                         request.send_bytes(&buffer).map_err(into_backoff_error)
                     })?;
@@ -211,6 +232,16 @@ impl IndexScheduler {
                         }
                         request.send_bytes(&buffer).map_err(into_backoff_error)
                     })?;
+
+                    Ok(())
+                })
+                .map_err(|e| {
+                    Error::from_milli(
+                        milli::Error::InternalError(InternalError::PanicInThreadPool(e)),
+                        Some(uid.to_string()),
+                    )
+                })?;

             step.store(total_documents, atomic::Ordering::Relaxed);
         }
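Taken together, the hunks above stripe the export across the shared request thread pool: each broadcast worker opens its own read transaction, only handles documents whose position matches its thread slot (i % ctx.num_threads() == ctx.index()), serializes them into a private NDJSON buffer, and flushes that buffer to {base_url}/indexes/{uid}/documents whenever it would grow past the 50 MiB limit. Below is a minimal, self-contained sketch of the same stripe-and-batch pattern, not the commit's code: it uses plain rayon::broadcast instead of ThreadPoolNoAbort, and the fetch_doc / send_batch helpers are hypothetical stand-ins for the index lookup, ureq POST, backoff, and error mapping done by the real loop.

// Sketch only: stripe a document export over a rayon pool and flush batches
// once they exceed a size limit. `fetch_doc` and `send_batch` are made up.
fn fetch_doc(docid: u32) -> String {
    // Stand-in for `index.document(&rtxn, docid)` plus JSON serialization.
    format!("{{\"id\":{docid}}}\n")
}

fn send_batch(worker: usize, batch: &[u8]) {
    // Stand-in for the POST to `{base_url}/indexes/{uid}/documents`.
    println!("worker {worker}: sending {} bytes", batch.len());
}

fn export(universe: &[u32]) {
    let limit = 50 * 1024 * 1024; // 50 MiB, same cap as the commit
    rayon::broadcast(|ctx| {
        // Each worker owns its buffer; no cross-thread synchronization needed.
        let mut buffer: Vec<u8> = Vec::new();
        for (i, &docid) in universe.iter().enumerate() {
            // Striping: worker `ctx.index()` only takes every Nth document.
            if i % ctx.num_threads() != ctx.index() {
                continue;
            }
            let doc = fetch_doc(docid);
            if buffer.len() + doc.len() > limit {
                send_batch(ctx.index(), &buffer);
                buffer.clear();
            }
            buffer.extend_from_slice(doc.as_bytes());
        }
        if !buffer.is_empty() {
            send_batch(ctx.index(), &buffer);
        }
    });
}

fn main() {
    export(&(0..10_000).collect::<Vec<u32>>());
}

Because every worker owns its buffer and its stripe of the document ids, the only coordination point is the broadcast join itself; the commit adds the panic-to-error mapping on top of that, as shown in the hunk above.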
@@ -766,6 +766,7 @@ fn basic_get_stats() {
         "documentDeletion": 0,
         "documentEdition": 0,
         "dumpCreation": 0,
+        "export": 0,
         "indexCreation": 3,
         "indexDeletion": 0,
         "indexSwap": 0,
@@ -806,6 +807,7 @@ fn basic_get_stats() {
         "documentDeletion": 0,
         "documentEdition": 0,
         "dumpCreation": 0,
+        "export": 0,
         "indexCreation": 3,
         "indexDeletion": 0,
         "indexSwap": 0,
@@ -847,6 +849,7 @@ fn basic_get_stats() {
         "documentDeletion": 0,
         "documentEdition": 0,
         "dumpCreation": 0,
+        "export": 0,
         "indexCreation": 3,
         "indexDeletion": 0,
         "indexSwap": 0,
@@ -1,7 +1,7 @@
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::Arc;

-use rayon::{ThreadPool, ThreadPoolBuilder};
+use rayon::{BroadcastContext, ThreadPool, ThreadPoolBuilder};
 use thiserror::Error;

 /// A rayon ThreadPool wrapper that can catch panics in the pool
@@ -32,6 +32,22 @@ impl ThreadPoolNoAbort {
         }
     }

+    pub fn broadcast<OP, R>(&self, op: OP) -> Result<Vec<R>, PanicCatched>
+    where
+        OP: Fn(BroadcastContext<'_>) -> R + Sync,
+        R: Send,
+    {
+        self.active_operations.fetch_add(1, Ordering::Relaxed);
+        let output = self.thread_pool.broadcast(op);
+        self.active_operations.fetch_sub(1, Ordering::Relaxed);
+        // While reseting the pool panic catcher we return an error if we catched one.
+        if self.pool_catched_panic.swap(false, Ordering::SeqCst) {
+            Err(PanicCatched)
+        } else {
+            Ok(output)
+        }
+    }
+
     pub fn current_num_threads(&self) -> usize {
         self.thread_pool.current_num_threads()
     }
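The new broadcast method keeps the wrapper's existing bookkeeping: the active-operation counter is bumped around the call, and the shared panic flag is swapped back to false afterwards so a panic caught by the pool is reported as PanicCatched instead of taking the process down. A hedged usage sketch follows; the ThreadPoolNoAbortBuilder constructor is milli's existing builder (not part of this diff), and its re-export path at the crate root is an assumption here.

// Illustrative only: build a ThreadPoolNoAbort and run a closure on every
// worker via the new `broadcast` method.
use milli::ThreadPoolNoAbortBuilder; // assumed re-export path

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let pool = ThreadPoolNoAbortBuilder::new().num_threads(4).build()?;

    // The closure runs once per worker; one result comes back per thread.
    let worker_ids = pool.broadcast(|ctx| ctx.index())?;
    assert_eq!(worker_ids.len(), pool.current_num_threads());
    println!("broadcast ran on workers {worker_ids:?}");
    Ok(())
}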
@@ -210,7 +210,7 @@ fn run_extraction_task<FE, FS, M>(
     })
 }

-fn request_threads() -> &'static ThreadPoolNoAbort {
+pub fn request_threads() -> &'static ThreadPoolNoAbort {
     static REQUEST_THREADS: OnceLock<ThreadPoolNoAbort> = OnceLock::new();

     REQUEST_THREADS.get_or_init(|| {
@@ -12,6 +12,7 @@ use std::sync::Arc;

 use crossbeam_channel::{Receiver, Sender};
 use enrich::enrich_documents_batch;
+pub use extract::request_threads;
 use grenad::{Merger, MergerBuilder};
 use hashbrown::HashMap;
 use heed::types::Str;
@@ -4,7 +4,7 @@ pub use self::clear_documents::ClearDocuments;
 pub use self::concurrent_available_ids::ConcurrentAvailableIds;
 pub use self::facet::bulk::FacetsUpdateBulk;
 pub use self::facet::incremental::FacetsUpdateIncrementalInner;
-pub use self::index_documents::*;
+pub use self::index_documents::{request_threads, *};
 pub use self::indexer_config::{default_thread_pool_and_threads, IndexerConfig};
 pub use self::new::ChannelCongestion;
 pub use self::settings::{validate_embedding_settings, Setting, Settings};
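The last three hunks are what let code outside the extractors reach the shared request pool: request_threads becomes pub, is re-exported from the indexing module, and then from milli::update, which is exactly the path imported by the first hunk of this commit. An illustrative snippet of that now-public path:

use meilisearch_types::milli::update::request_threads;

// The pool is a lazily initialized static (OnceLock); this just reports its size.
fn request_pool_size() -> usize {
    request_threads().current_num_threads()
}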