mirror of
https://github.com/meilisearch/MeiliSearch
synced 2025-07-03 11:57:07 +02:00
Merge #2523
2523: Improve the tasks error reporting when processed in batches r=irevoire a=Kerollmops

This fixes #2478 by changing the behavior of the task handler when there is an error in a batch of document additions or updates. When a task in a batch fails with a user error, we now report that task as failed with the right error message, but we continue to process the other tasks. A user error can occur when a geo field is invalid, or when a document id is invalid or missing.

fixes #2582, #2478

Co-authored-by: Kerollmops <clement@meilisearch.com>
Co-authored-by: ManyTheFish <many@meilisearch.com>
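To illustrate the intended behavior, here is a minimal, self-contained sketch. It is not the actual Meilisearch task handler; `TaskStatus`, `process_one`, and `process_batch` are hypothetical names. The point is only that a user error fails the offending task and the batch keeps going:

```rust
// Hedged sketch: a user error fails only the offending task,
// and the rest of the batch continues to be processed.
#[derive(Debug, PartialEq)]
enum TaskStatus {
    Succeeded,
    Failed(String), // user-facing error message
}

// Hypothetical per-task work: reject documents with an empty id.
fn process_one(document_id: &str) -> Result<(), String> {
    if document_id.is_empty() {
        Err("The document id is invalid or missing.".to_string())
    } else {
        Ok(())
    }
}

fn process_batch(tasks: &[&str]) -> Vec<TaskStatus> {
    tasks
        .iter()
        .map(|id| match process_one(id) {
            Ok(()) => TaskStatus::Succeeded,
            // Report the failure for this task only; the loop continues.
            Err(msg) => TaskStatus::Failed(msg),
        })
        .collect()
}

fn main() {
    let statuses = process_batch(&["doc-1", "", "doc-3"]);
    assert_eq!(statuses[0], TaskStatus::Succeeded);
    assert!(matches!(statuses[1], TaskStatus::Failed(_)));
    assert_eq!(statuses[2], TaskStatus::Succeeded); // processing continued
}
```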
This commit is contained in:
commit b5f91b91c3
23 changed files with 251 additions and 222 deletions
@@ -1,17 +0,0 @@
-use meilisearch_lib::heed::Env;
-use walkdir::WalkDir;
-
-pub trait EnvSizer {
-    fn size(&self) -> u64;
-}
-
-impl EnvSizer for Env {
-    fn size(&self) -> u64 {
-        WalkDir::new(self.path())
-            .into_iter()
-            .filter_map(|entry| entry.ok())
-            .filter_map(|entry| entry.metadata().ok())
-            .filter(|metadata| metadata.is_file())
-            .fold(0, |acc, m| acc + m.len())
-    }
-}
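The deleted `EnvSizer` trait measured an LMDB `Env`'s on-disk footprint by walking the environment's directory with `walkdir` and summing the lengths of the regular files it contains.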
@@ -1,3 +0,0 @@
-mod env;
-
-pub use env::EnvSizer;
@@ -5,7 +5,6 @@ pub mod analytics;
 pub mod task;
 #[macro_use]
 pub mod extractors;
-pub mod helpers;
 pub mod option;
 pub mod routes;
 
@@ -30,9 +29,9 @@ pub static AUTOBATCHING_ENABLED: AtomicBool = AtomicBool::new(false);
 pub fn setup_meilisearch(opt: &Opt) -> anyhow::Result<MeiliSearch> {
     let mut meilisearch = MeiliSearch::builder();
 
-    // enable autobatching?
+    // disable autobatching?
     AUTOBATCHING_ENABLED.store(
-        opt.scheduler_options.enable_auto_batching,
+        !opt.scheduler_options.disable_auto_batching,
         std::sync::atomic::Ordering::Relaxed,
     );
 
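Note the polarity flip: instead of opting in via `enable_auto_batching`, users now opt out via `disable_auto_batching`, and the stored flag is its negation. A minimal sketch of the same relaxed-ordering global-flag pattern, with a stand-in for the parsed option (an assumed simplification, not the real setup code):

```rust
use std::sync::atomic::{AtomicBool, Ordering};

// A process-wide flag written once at startup and read later; relaxed
// ordering suffices because the flag does not synchronize other memory.
static AUTOBATCHING_ENABLED: AtomicBool = AtomicBool::new(false);

fn main() {
    let disable_auto_batching = false; // stand-in for the parsed CLI option
    AUTOBATCHING_ENABLED.store(!disable_auto_batching, Ordering::Relaxed);
    assert!(AUTOBATCHING_ENABLED.load(Ordering::Relaxed));
}
```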
@@ -231,7 +231,7 @@ pub struct TaskView {
     #[serde(serialize_with = "time::serde::rfc3339::option::serialize")]
     finished_at: Option<OffsetDateTime>,
     #[serde(skip_serializing_if = "Option::is_none")]
-    batch_uid: Option<Option<BatchId>>,
+    batch_uid: Option<BatchId>,
 }
 
 impl From<Task> for TaskView {
@@ -380,15 +380,15 @@ impl From<Task> for TaskView {
 
         let duration = finished_at.zip(started_at).map(|(tf, ts)| (tf - ts));
 
-        let batch_uid = if AUTOBATCHING_ENABLED.load(std::sync::atomic::Ordering::Relaxed) {
-            let id = events.iter().find_map(|e| match e {
-                TaskEvent::Batched { batch_id, .. } => Some(*batch_id),
-                _ => None,
-            });
-            Some(id)
-        } else {
-            None
-        };
+        let batch_uid = AUTOBATCHING_ENABLED
+            .load(std::sync::atomic::Ordering::Relaxed)
+            .then(|| {
+                events.iter().find_map(|e| match e {
+                    TaskEvent::Batched { batch_id, .. } => Some(*batch_id),
+                    _ => None,
+                })
+            })
+            .flatten();
 
         Self {
             uid: id,
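This refactor replaces an `if/else` that produced a nested `Option<Option<BatchId>>` with `bool::then` followed by `Option::flatten`, matching the flattened field type above. A small stand-alone example of the idiom (the `events` array here is just illustrative):

```rust
// `bool::then` yields `Some(f())` when the bool is true and `None` otherwise,
// so a closure returning `Option<T>` produces `Option<Option<T>>`, which
// `flatten` collapses back to `Option<T>`.
fn main() {
    let events = [2u32, 7, 4];
    let enabled = true;
    let first_odd = enabled
        .then(|| events.iter().copied().find(|e| e % 2 == 1))
        .flatten();
    assert_eq!(first_odd, Some(7));

    let disabled = false;
    assert_eq!(disabled.then(|| Some(1)).flatten(), None::<i32>);
}
```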
@@ -1,5 +1,6 @@
+use crate::common::{GetAllDocumentsOptions, Server};
 use actix_web::test;
 
 use meilisearch_http::{analytics, create_app};
 use serde_json::{json, Value};
 use time::{format_description::well_known::Rfc3339, OffsetDateTime};
@@ -326,7 +327,7 @@ async fn error_add_malformed_json_documents() {
     assert_eq!(
         response["message"],
         json!(
-            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...890123456789", expected a documents, or a sequence of documents. at line 1 column 102`."#
+            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...890123456789012345678901234567890123456789", expected a sequence at line 1 column 102`."#
         )
     );
     assert_eq!(response["code"], json!("malformed_payload"));
@@ -349,9 +350,7 @@ async fn error_add_malformed_json_documents() {
     assert_eq!(status_code, 400);
     assert_eq!(
         response["message"],
-        json!(
-            r#"The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string "0123456789012345678901234567...90123456789m", expected a documents, or a sequence of documents. at line 1 column 103`."#
-        )
+        json!("The `json` payload provided is malformed. `Couldn't serialize document value: invalid type: string \"0123456789012345678901234567...90123456789012345678901234567890123456789m\", expected a sequence at line 1 column 103`.")
     );
     assert_eq!(response["code"], json!("malformed_payload"));
     assert_eq!(response["type"], json!("invalid_request"));
@@ -388,7 +387,7 @@ async fn error_add_malformed_ndjson_documents() {
     assert_eq!(
         response["message"],
         json!(
-            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 1 column 2`."#
+            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`."#
         )
     );
     assert_eq!(response["code"], json!("malformed_payload"));
@@ -411,9 +410,7 @@ async fn error_add_malformed_ndjson_documents() {
     assert_eq!(status_code, 400);
     assert_eq!(
         response["message"],
-        json!(
-            r#"The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 1 column 2`."#
-        )
+        json!("The `ndjson` payload provided is malformed. `Couldn't serialize document value: key must be a string at line 2 column 2`.")
     );
     assert_eq!(response["code"], json!("malformed_payload"));
     assert_eq!(response["type"], json!("invalid_request"));
@@ -1020,7 +1017,7 @@ async fn add_documents_invalid_geo_field() {
     index.wait_task(2).await;
     let (response, code) = index.get_task(2).await;
     assert_eq!(code, 200);
-    assert_eq!(response["status"], "succeeded");
+    assert_eq!(response["status"], "failed");
 }
 
 #[actix_rt::test]
@@ -1099,3 +1096,62 @@ async fn add_documents_with_primary_key_twice() {
     let (response, _code) = index.get_task(1).await;
     assert_eq!(response["status"], "succeeded");
 }
+
+#[actix_rt::test]
+async fn batch_several_documents_addition() {
+    let server = Server::new().await;
+    let index = server.index("test");
+
+    let mut documents: Vec<_> = (0..150usize)
+        .into_iter()
+        .map(|id| {
+            json!(
+                {
+                    "id": id,
+                    "title": "foo",
+                    "desc": "bar"
+                }
+            )
+        })
+        .collect();
+
+    documents[100] = json!({"title": "error", "desc": "error"});
+
+    // enqueue batch of documents
+    let mut waiter = Vec::new();
+    for chunk in documents.chunks(30) {
+        waiter.push(index.add_documents(json!(chunk), Some("id")));
+    }
+
+    // wait first batch of documents to finish
+    futures::future::join_all(waiter).await;
+    index.wait_task(4).await;
+
+    // run a second completely failing batch
+    documents[40] = json!({"title": "error", "desc": "error"});
+    documents[70] = json!({"title": "error", "desc": "error"});
+    documents[130] = json!({"title": "error", "desc": "error"});
+    let mut waiter = Vec::new();
+    for chunk in documents.chunks(30) {
+        waiter.push(index.add_documents(json!(chunk), Some("id")));
+    }
+    // wait second batch of documents to finish
+    futures::future::join_all(waiter).await;
+    index.wait_task(9).await;
+
+    let (response, _code) = index.filtered_tasks(&[], &["failed"]).await;
+
+    // Check if only the 6th task failed
+    println!("{}", &response);
+    assert_eq!(response["results"].as_array().unwrap().len(), 5);
+
+    // Check if there are exactly 120 documents (150 - 30) in the index;
+    let (response, code) = index
+        .get_all_documents(GetAllDocumentsOptions {
+            limit: Some(200),
+            ..Default::default()
+        })
+        .await;
+    assert_eq!(code, 200, "failed with `{}`", response);
+    assert_eq!(response["results"].as_array().unwrap().len(), 120);
+}
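For reference, the numbers asserted in this new test follow from the chunking arithmetic: 150 documents split into chunks of 30 produce 5 addition tasks per round. In the first round, only the chunk containing document 100 (ids 90 to 119) fails, so one of tasks 0 to 4 fails and 120 documents land in the index. In the second round, the bad documents at indices 40, 70, 100, and 130 make four of the five chunks fail; the one passing chunk only re-adds ids 0 to 29, leaving the document count at 120 and the total number of failed tasks at 1 + 4 = 5.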
@@ -708,9 +708,7 @@ async fn faceting_max_values_per_facet() {
         }),
         |response, code| {
            assert_eq!(code, 200, "{}", response);
-            let numbers = dbg!(&response)["facetDistribution"]["number"]
-                .as_object()
-                .unwrap();
+            let numbers = &response["facetDistribution"]["number"].as_object().unwrap();
             assert_eq!(numbers.len(), 10_000);
         },
     )
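Side note on the change above: `dbg!(&response)` prints the file, line, and value to stderr and returns the value unchanged, so dropping it only removes debug noise from the test run; the assertion on `numbers.len()` is unaffected.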