Merge #3576

3576: Add boolean support for csv documents r=irevoire a=irevoire Fixes https://github.com/meilisearch/meilisearch/issues/3572 ## What does this PR do? Add support for the boolean types in csv documents. The type definition is `boolean` and the possible values are - `true` for true - `false` for false - ` ` for null Here is an example: ```csv #id,cute:boolean 0,true 1,false 2, ``` Co-authored-by: Tamo <tamo@meilisearch.com>
2025-07-15 13:58:36 +02:00 · 2023-03-14 12:28:12 +00:00 · 2023-03-14 12:28:12 +00:00 · 70c906d4b4
commit 70c906d4b4
parent 7c9a8b1e1b 0f33a65468
3 changed files with 164 additions and 62 deletions
--- a/meilisearch/tests/documents/add_documents.rs
+++ b/meilisearch/tests/documents/add_documents.rs
@ -279,6 +279,81 @@ async fn add_csv_document() {
    "###);
 }

+#[actix_rt::test] // End-to-end check: CSV headers suffixed with `:number`, `:string` or `:boolean` produce typed JSON values.
+async fn add_csv_document_with_types() {
+    let server = Server::new().await;
+    let index = server.index("pets");
+
+    let document = "#id:number,name:string,race:string,age:number,cute:boolean
+0,jean,bernese mountain,2.5,true
+1,,,,
+2,lilou,pug,-2,false";
+
+    let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; // row `1,,,,` leaves every typed column except the id empty
+    snapshot!(code, @"202 Accepted"); // the payload is accepted and a task is enqueued (see the snapshot below)
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "taskUid": 0,
+      "indexUid": "pets",
+      "status": "enqueued",
+      "type": "documentAdditionOrUpdate",
+      "enqueuedAt": "[date]"
+    }
+    "###);
+    let response = index.wait_task(response["taskUid"].as_u64().unwrap()).await; // the snapshot below expects the task to have succeeded with all 3 rows indexed
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]", ".startedAt" => "[date]", ".finishedAt" => "[date]", ".duration" => "[duration]" }), @r###"
+    {
+      "uid": 0,
+      "indexUid": "pets",
+      "status": "succeeded",
+      "type": "documentAdditionOrUpdate",
+      "canceledBy": null,
+      "details": {
+        "receivedDocuments": 3,
+        "indexedDocuments": 3
+      },
+      "error": null,
+      "duration": "[duration]",
+      "enqueuedAt": "[date]",
+      "startedAt": "[date]",
+      "finishedAt": "[date]"
+    }
+    "###);
+
+    let (documents, code) = index.get_all_documents(GetAllDocumentsOptions::default()).await; // read the documents back to inspect the stored value types
+    snapshot!(code, @"200 OK"); // expected below: empty cells become `null`, numbers and booleans are stored unquoted
+    snapshot!(json_string!(documents), @r###"
+    {
+      "results": [
+        {
+          "#id": 0,
+          "name": "jean",
+          "race": "bernese mountain",
+          "age": 2.5,
+          "cute": true
+        },
+        {
+          "#id": 1,
+          "name": null,
+          "race": null,
+          "age": null,
+          "cute": null
+        },
+        {
+          "#id": 2,
+          "name": "lilou",
+          "race": "pug",
+          "age": -2,
+          "cute": false
+        }
+      ],
+      "offset": 0,
+      "limit": 20,
+      "total": 3
+    }
+    "###);
+}
+
 #[actix_rt::test]
 async fn add_csv_document_with_custom_delimiter() {
    let server = Server::new().await;
@ -343,6 +418,40 @@ async fn add_csv_document_with_custom_delimiter() {
    "###);
 }

+#[actix_rt::test] // Checks that a cell that does not parse as its declared column type is rejected with `malformed_payload`.
+async fn add_csv_document_with_types_error() {
+    let server = Server::new().await;
+    let index = server.index("pets");
+
+    let document = "#id:number,a:boolean,b:number
+0,doggo,1";
+
+    let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; // "doggo" sits in the `a:boolean` column
+    snapshot!(code, @"400 Bad Request"); // the response is an error body, not an enqueued task
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "message": "The `csv` payload provided is malformed: `Error parsing boolean \"doggo\" at line 1: provided string was not `true` or `false``.",
+      "code": "malformed_payload",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#malformed_payload"
+    }
+    "###);
+
+    let document = "#id:number,a:boolean,b:number
+0,true,doggo";
+
+    let (response, code) = index.raw_update_documents(document, Some("text/csv"), "").await; // this time "doggo" sits in the `b:number` column
+    snapshot!(code, @"400 Bad Request"); // same error code, but the message below names the number parser
+    snapshot!(json_string!(response, { ".enqueuedAt" => "[date]" }), @r###"
+    {
+      "message": "The `csv` payload provided is malformed: `Error parsing number \"doggo\" at line 1: invalid float literal`.",
+      "code": "malformed_payload",
+      "type": "invalid_request",
+      "link": "https://docs.meilisearch.com/errors#malformed_payload"
+    }
+    "###);
+}
+
 /// any other content-type must be refused
 #[actix_rt::test]
 async fn error_add_documents_test_bad_content_types() {
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -114,14 +114,15 @@ impl<W: Write> DocumentsBatchBuilder<W> {
                self.value_buffer.clear();

                let value = &record[*i];
+                let trimmed_value = value.trim();
                match type_ {
                    AllowedType::Number => {
-                        if value.trim().is_empty() {
+                        if trimmed_value.is_empty() {
                            to_writer(&mut self.value_buffer, &Value::Null)?;
-                        } else if let Ok(integer) = value.trim().parse::<i64>() {
+                        } else if let Ok(integer) = trimmed_value.parse::<i64>() {
                            to_writer(&mut self.value_buffer, &integer)?;
                        } else {
-                            match value.trim().parse::<f64>() {
+                            match trimmed_value.parse::<f64>() {
                                Ok(float) => {
                                    to_writer(&mut self.value_buffer, &float)?;
                                }
@ -135,6 +136,24 @@ impl<W: Write> DocumentsBatchBuilder<W> {
                            }
                        }
                    }
+                    AllowedType::Boolean => {
+                        if trimmed_value.is_empty() {
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
+                        } else {
+                            match trimmed_value.parse::<bool>() {
+                                Ok(bool) => {
+                                    to_writer(&mut self.value_buffer, &bool)?;
+                                }
+                                Err(error) => {
+                                    return Err(Error::ParseBool {
+                                        error,
+                                        line,
+                                        value: value.to_string(),
+                                    });
+                                }
+                            }
+                        }
+                    }
                    AllowedType::String => {
                        if value.is_empty() {
                            to_writer(&mut self.value_buffer, &Value::Null)?;
@ -173,6 +192,7 @@ impl<W: Write> DocumentsBatchBuilder<W> {
 #[derive(Debug)]
 enum AllowedType { // value type a CSV column declares through a `name:type` header suffix
    String, // `:string`; also the fallback when the suffix is missing or not a known type
+    Boolean, // `:boolean`
    Number, // `:number`
 }

@ -181,6 +201,7 @@ fn parse_csv_header(header: &str) -> (&str, AllowedType) {
    match header.rsplit_once(':') {
        Some((field_name, field_type)) => match field_type {
            "string" => (field_name, AllowedType::String),
+            "boolean" => (field_name, AllowedType::Boolean),
            "number" => (field_name, AllowedType::Number),
            // if the pattern isn't recognized, we keep the whole field.
            _otherwise => (header, AllowedType::String),
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -3,7 +3,7 @@ mod enriched;
 mod reader;
 mod serde_impl;

-use std::fmt::{self, Debug};
+use std::fmt::Debug;
 use std::io;
 use std::str::Utf8Error;

@ -87,71 +87,30 @@ impl DocumentsBatchIndex {
    }
 }

-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)] // `Display`, `std::error::Error` and the `From` conversions are now derived instead of hand-written
 pub enum Error {
+    #[error("Error parsing number {value:?} at line {line}: {error}")]
    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, // `value` and `line` identify the offending CSV cell in the message
+    #[error("Error parsing boolean {value:?} at line {line}: {error}")]
+    ParseBool { error: std::str::ParseBoolError, line: usize, value: String }, // a non-empty `:boolean` cell that `str::parse::<bool>` rejected
+    #[error("Invalid document addition format, missing the documents batch index.")]
    InvalidDocumentFormat,
+    #[error("Invalid enriched data.")]
    InvalidEnrichedData,
-    InvalidUtf8(Utf8Error),
-    Csv(csv::Error),
-    Json(serde_json::Error),
+    #[error(transparent)] // `transparent` forwards `Display` to the wrapped error
+    InvalidUtf8(#[from] Utf8Error),
+    #[error(transparent)]
+    Csv(#[from] csv::Error),
+    #[error(transparent)]
+    Json(#[from] serde_json::Error),
+    #[error(transparent)]
    Serialize(serde_json::Error), // no `#[from]`: `Json` already provides the `From<serde_json::Error>` conversion
-    Grenad(grenad::Error),
-    Io(io::Error),
+    #[error(transparent)]
+    Grenad(#[from] grenad::Error),
+    #[error(transparent)]
+    Io(#[from] io::Error),
 }

-impl From<csv::Error> for Error {
-    fn from(e: csv::Error) -> Self {
-        Self::Csv(e)
-    }
-}
-
-impl From<io::Error> for Error {
-    fn from(other: io::Error) -> Self {
-        Self::Io(other)
-    }
-}
-
-impl From<serde_json::Error> for Error {
-    fn from(other: serde_json::Error) -> Self {
-        Self::Json(other)
-    }
-}
-
-impl From<grenad::Error> for Error {
-    fn from(other: grenad::Error) -> Self {
-        Self::Grenad(other)
-    }
-}
-
-impl From<Utf8Error> for Error {
-    fn from(other: Utf8Error) -> Self {
-        Self::InvalidUtf8(other)
-    }
-}
-
-impl fmt::Display for Error {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        match self {
-            Error::ParseFloat { error, line, value } => {
-                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
-            }
-            Error::InvalidDocumentFormat => {
-                f.write_str("Invalid document addition format, missing the documents batch index.")
-            }
-            Error::InvalidEnrichedData => f.write_str("Invalid enriched data."),
-            Error::InvalidUtf8(e) => write!(f, "{}", e),
-            Error::Io(e) => write!(f, "{}", e),
-            Error::Serialize(e) => write!(f, "{}", e),
-            Error::Grenad(e) => write!(f, "{}", e),
-            Error::Csv(e) => write!(f, "{}", e),
-            Error::Json(e) => write!(f, "{}", e),
-        }
-    }
-}
-
-impl std::error::Error for Error {}
-
 #[cfg(test)]
 pub fn objects_from_json_value(json: serde_json::Value) -> Vec<crate::Object> {
    let documents = match json {
@ -274,6 +233,19 @@ mod test {
        ]);
    }

+    #[test] // Smoke test: a CSV mixing typed and untyped columns must build into a batch without errors or panics.
+    fn csv_types_dont_panic() {
+        let csv1_content =
+            "id:number,b:boolean,c,d:number\n1,,,\n2,true,doggo,2\n3,false,the best doggo,-2\n4,,\"Hello, World!\",2.5"; // covers empty cells, an untyped column `c`, a quoted comma, a negative integer and a float
+        let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content));
+
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_csv(csv1).unwrap(); // any typed-cell parse error would surface here
+        let vector = builder.into_inner().unwrap();
+
+        DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); // the produced bytes must be accepted back by the reader
+    }
+
    #[test]
    fn out_of_order_csv_fields() {
        let csv1_content = "id:number,b\n1,0";