merge with main

2025-07-04 04:17:10 +02:00 · 2021-11-06 16:34:30 +01:00 · 2021-11-06 16:34:30 +01:00 · 6831c23449
commit 6831c23449
parent 5c01e9bf7c a2fc74f010
25 changed files with 824 additions and 953 deletions
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@ -1,7 +1,7 @@
 #![allow(dead_code)]

 use std::fs::{create_dir_all, remove_dir_all, File};
-use std::io::{self, Cursor, Read, Seek};
+use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
 use std::num::ParseFloatError;
 use std::path::Path;

@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<imp
    DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
 }

-fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
    }
    documents.finish()?;

    Ok(writer.into_inner())
 }

-fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
    documents.finish()?;

    Ok(writer.into_inner())
 }

-fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;

    Ok(writer.into_inner())
 }
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.1.0"
+version = "0.20.2"
 edition = "2018"
 description = "A CLI to interact with a milli index"

--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{stdin, Cursor, Read};
+use std::io::{stdin, BufRead, BufReader, Cursor, Read};
 use std::path::PathBuf;
 use std::str::FromStr;

@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
 use milli::update::UpdateIndexingStep::{
    ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
 };
-use serde_json::{Map, Value};
 use structopt::StructOpt;

 #[cfg(target_os = "linux")]
@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
    }
    documents.finish()?;

@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
    documents.finish()?;

    Ok(writer.into_inner())
@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {

 fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let mut records = csv::Reader::from_reader(reader);
-    let iter = records.deserialize::<Map<String, Value>>();
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;

    Ok(writer.into_inner())
 }
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"

--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"

--- a/http-ui/src/documents_from_csv.rs
+++ b/http-ui/src/documents_from_csv.rs
@ -1,285 +0,0 @@
-use std::io::{Read, Result as IoResult};
-use std::num::ParseFloatError;
-
-use serde_json::{Map, Value};
-
-enum AllowedType {
-    String,
-    Number,
-}
-
-fn parse_csv_header(header: &str) -> (String, AllowedType) {
-    // if there are several separators we only split on the last one.
-    match header.rsplit_once(':') {
-        Some((field_name, field_type)) => match field_type {
-            "string" => (field_name.to_string(), AllowedType::String),
-            "number" => (field_name.to_string(), AllowedType::Number),
-            // we may return an error in this case.
-            _otherwise => (header.to_string(), AllowedType::String),
-        },
-        None => (header.to_string(), AllowedType::String),
-    }
-}
-
-pub struct CSVDocumentDeserializer<R>
-where
-    R: Read,
-{
-    documents: csv::StringRecordsIntoIter<R>,
-    headers: Vec<(String, AllowedType)>,
-}
-
-impl<R: Read> CSVDocumentDeserializer<R> {
-    pub fn from_reader(reader: R) -> IoResult<Self> {
-        let mut records = csv::Reader::from_reader(reader);
-
-        let headers = records.headers()?.into_iter().map(parse_csv_header).collect();
-
-        Ok(Self { documents: records.into_records(), headers })
-    }
-}
-
-impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
-    type Item = anyhow::Result<Map<String, Value>>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let csv_document = self.documents.next()?;
-
-        match csv_document {
-            Ok(csv_document) => {
-                let mut document = Map::new();
-
-                for ((field_name, field_type), value) in
-                    self.headers.iter().zip(csv_document.into_iter())
-                {
-                    let parsed_value: Result<Value, ParseFloatError> = match field_type {
-                        AllowedType::Number => {
-                            value.parse::<f64>().map(Value::from).map_err(Into::into)
-                        }
-                        AllowedType::String => Ok(Value::String(value.to_string())),
-                    };
-
-                    match parsed_value {
-                        Ok(value) => drop(document.insert(field_name.to_string(), value)),
-                        Err(_e) => {
-                            return Some(Err(anyhow::anyhow!(
-                                "Value '{}' is not a valid number",
-                                value
-                            )))
-                        }
-                    }
-                }
-
-                Some(Ok(document))
-            }
-            Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use serde_json::json;
-
-    use super::*;
-
-    #[test]
-    fn simple_csv_document() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn coma_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United, States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United, States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn quote_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United"" States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United\" States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn integer_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.0,
-            })
-        );
-    }
-
-    #[test]
-    fn float_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910.01""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.01,
-            })
-        );
-    }
-
-    #[test]
-    fn several_double_dot_in_header() {
-        let documents = r#"city:love:string,country:state,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:love": "Boston",
-                "country:state": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn ending_by_double_dot_in_header() {
-        let documents = r#"city:,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header() {
-        let documents = r#":city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                ":city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header2() {
-        let documents = r#":string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn double_double_dot_in_header() {
-        let documents = r#"city::string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count1() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910", "too much""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count2() {
-        let documents = r#"city,country,pop
-"Boston","United States""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-}
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@ -1,10 +1,9 @@
-mod documents_from_csv;
 mod update_store;

 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{create_dir_all, File};
-use std::io::Cursor;
+use std::io::{BufRead, BufReader, Cursor};
 use std::net::SocketAddr;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::PathBuf;
@ -40,7 +39,6 @@ use warp::http::Response;
 use warp::Filter;

 use self::update_store::UpdateStore;
-use crate::documents_from_csv::CSVDocumentDeserializer;

 #[cfg(target_os = "linux")]
 #[global_allocator]
@ -1048,11 +1046,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
    }
    documents.finish()?;

@ -1063,8 +1061,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;

-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
    documents.finish()?;

    Ok(writer.into_inner())
@ -1072,16 +1069,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {

 fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
    let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;

    Ok(writer.into_inner())
 }
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"

--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Kerollmops <clement@meilisearch.com>"]
 edition = "2018"

@ -46,6 +46,7 @@ itertools = "0.10.0"
 # logging
 log = "0.4.14"
 logging_timer = "1.0.0"
+csv = "1.1.6"

 [dev-dependencies]
 big_s = "1.0.2"
--- a/milli/src/asc_desc.rs
+++ b/milli/src/asc_desc.rs
@ -28,12 +28,12 @@ impl fmt::Display for AscDescError {
                write!(f, "Longitude must be contained between -180 and 180 degrees.",)
            }
            Self::InvalidSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for {}.", name)
+                write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
            }
            Self::ReservedKeyword { name } => {
                write!(
                    f,
-                    "{} is a reserved keyword and thus can't be used as a asc/desc rule.",
+                    "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
                    name
                )
            }
@ -192,18 +192,18 @@ impl fmt::Display for SortError {
            Self::BadGeoPointUsage { name } => {
                write!(
                    f,
-                    "invalid syntax for the `_geoPoint` parameter: `{}`. \
-                    Usage: `_geoPoint(latitude, longitude):asc`.",
+                    "Invalid syntax for the geo parameter: expected expression formated like \
+                    `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
                    name
                )
            }
            Self::InvalidName { name } => {
-                write!(f, "invalid syntax for the sort parameter `{}`.", name)
+                write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
            }
            Self::ReservedName { name } => {
                write!(
                    f,
-                    "{} is a reserved keyword and thus can't be used as a sort expression.",
+                    "`{}` is a reserved keyword and thus can't be used as a sort expression.",
                    name
                )
            }
@ -211,7 +211,7 @@ impl fmt::Display for SortError {
                write!(
                    f,
                    "`{}` is a reserved keyword and thus can't be used as a sort expression. \
-                    Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.",
+                    Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.",
                    name,
                )
            }
@ -318,50 +318,4 @@ mod tests {
            );
        }
    }
-
-    #[test]
-    fn sort_error_message() {
-        let errors = [
-            (
-                AscDescError::InvalidSyntax { name: S("truc:machin") },
-                S("invalid syntax for the sort parameter `truc:machin`."),
-            ),
-            (
-                AscDescError::InvalidSyntax { name: S("hello:world") },
-                S("invalid syntax for the sort parameter `hello:world`."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geo") },
-                S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoDistance") },
-                S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.")
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") },
-                S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::InvalidLatitude,
-                S("Latitude must be contained between -90 and 90 degrees."),
-            ),
-            (
-                AscDescError::InvalidLongitude,
-                S("Longitude must be contained between -180 and 180 degrees."),
-            ),
-        ];
-
-        for (asc_desc_error, expected_message) in errors {
-            let sort_error = SortError::from(asc_desc_error);
-            assert_eq!(
-                sort_error.to_string(),
-                expected_message,
-                "was expecting {} for the error {:?} but instead got {}",
-                expected_message,
-                sort_error,
-                sort_error.to_string()
-            );
-        }
-    }
 }
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@ -17,23 +17,27 @@ pub enum CriterionError {
 impl fmt::Display for CriterionError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
-            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name),
            Self::ReservedName { name } => {
-                write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
+                write!(
+                    f,
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule",
+                    name
+                )
            }
            Self::ReservedNameForSort { name } => {
                write!(
                    f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-{} can only be used for sorting at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+`{}` can only be used for sorting at search time",
                    name, name
                )
            }
            Self::ReservedNameForFilter { name } => {
                write!(
                    f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-{} can only be used for filtering at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+`{}` can only be used for filtering at search time",
                    name, name
                )
            }
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -1,16 +1,20 @@
+use std::collections::BTreeMap;
 use std::io;
+use std::io::{Cursor, Write};

 use byteorder::{BigEndian, WriteBytesExt};
-use serde::ser::Serialize;
+use serde::Deserializer;
+use serde_json::Value;

-use super::serde::DocumentSerializer;
+use super::serde::DocumentVisitor;
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
+use crate::FieldId;

 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
 /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate other the documents.
+/// iterate over the documents.
 ///
 /// ## example:
 /// ```
@ -18,43 +22,48 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 /// use serde_json::json;
 /// use std::io::Cursor;
 ///
+/// let json = r##"{"id": 1, "name": "foo"}"##;
 /// let mut writer = Cursor::new(Vec::new());
 /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
+/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
 /// builder.finish().unwrap();
 /// ```
 pub struct DocumentBatchBuilder<W> {
-    serializer: DocumentSerializer<W>,
+    inner: ByteCounter<W>,
+    index: DocumentsBatchIndex,
+    obkv_buffer: Vec<u8>,
+    value_buffer: Vec<u8>,
+    values: BTreeMap<FieldId, Value>,
+    count: usize,
 }

 impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
    pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::new();
+        let index = DocumentsBatchIndex::default();
        let mut writer = ByteCounter::new(writer);
        // add space to write the offset of the metadata at the end of the writer
        writer.write_u64::<BigEndian>(0)?;

-        let serializer =
-            DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
-
-        Ok(Self { serializer })
+        Ok(Self {
+            inner: writer,
+            index,
+            obkv_buffer: Vec::new(),
+            value_buffer: Vec::new(),
+            values: BTreeMap::new(),
+            count: 0,
+        })
    }

    /// Returns the number of documents that have been written to the builder.
    pub fn len(&self) -> usize {
-        self.serializer.count
+        self.count
    }

    /// This method must be called after the document addition is terminated. It will put the
    /// metadata at the end of the file, and write the metadata offset at the beginning on the
    /// file.
-    pub fn finish(self) -> Result<(), Error> {
-        let DocumentSerializer {
-            writer: ByteCounter { mut writer, count: offset },
-            index,
-            count,
-            ..
-        } = self.serializer;
+    pub fn finish(self) -> Result<usize, Error> {
+        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;

        let meta = DocumentsMetadata { count, index };

@ -65,16 +74,481 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {

        writer.flush()?;

-        Ok(())
+        Ok(count)
    }

-    /// Adds documents to the builder.
+    /// Extends the builder with json documents from a reader.
+    pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+        let mut de = serde_json::Deserializer::from_reader(reader);
+
+        let mut visitor = DocumentVisitor {
+            inner: &mut self.inner,
+            index: &mut self.index,
+            obkv_buffer: &mut self.obkv_buffer,
+            value_buffer: &mut self.value_buffer,
+            values: &mut self.values,
+            count: &mut self.count,
+        };
+
+        de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
+    }
+
+    /// Creates a builder from a reader of CSV documents.
    ///
-    /// The internal index is updated with the fields found
-    /// in the documents. Document must either be a map or a sequences of map, anything else will
-    /// fail.
-    pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
-        document.serialize(&mut self.serializer)?;
-        Ok(())
+    /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
+    /// optimisations, and extending from another CSV is not allowed.
+    pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
+        let mut this = Self::new(writer)?;
+        // Ensure that this is the first and only addition made with this builder
+        debug_assert!(this.index.is_empty());
+
+        let mut records = csv::Reader::from_reader(reader);
+
+        let headers = records
+            .headers()?
+            .into_iter()
+            .map(parse_csv_header)
+            .map(|(k, t)| (this.index.insert(&k), t))
+            .collect::<BTreeMap<_, _>>();
+
+        for (i, record) in records.into_records().enumerate() {
+            let record = record?;
+            this.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
+            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
+                let value = match ty {
+                    AllowedType::Number => {
+                        value.parse::<f64>().map(Value::from).map_err(|error| {
+                            Error::ParseFloat {
+                                error,
+                                // +1 for the header offset.
+                                line: i + 1,
+                                value: value.to_string(),
+                            }
+                        })?
+                    }
+                    AllowedType::String => Value::String(value.to_string()),
+                };
+
+                this.value_buffer.clear();
+                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
+                writer.insert(*fid, &this.value_buffer)?;
+            }
+
+            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
+            this.inner.write_all(&this.obkv_buffer)?;
+
+            this.count += 1;
+        }
+
+        Ok(this)
+    }
+}
+
+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Number,
+}
+
+fn parse_csv_header(header: &str) -> (String, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name.to_string(), AllowedType::String),
+            "number" => (field_name.to_string(), AllowedType::Number),
+            // if the pattern isn't reconized, we keep the whole field.
+            _otherwise => (header.to_string(), AllowedType::String),
+        },
+        None => (header.to_string(), AllowedType::String),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::io::Cursor;
+
+    use serde_json::{json, Map};
+
+    use super::*;
+    use crate::documents::DocumentBatchReader;
+
+    fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
+        let mut map = Map::new();
+
+        for (fid, value) in obkv.iter() {
+            let field_name = index.name(fid).unwrap().clone();
+            let value: Value = serde_json::from_slice(value).unwrap();
+
+            map.insert(field_name, value);
+        }
+
+        Value::Object(map)
+    }
+
+    #[test]
+    fn add_single_documents_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!({
+            "id": 1,
+            "field": "hello!",
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        let json = serde_json::json!({
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_seq_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!([{
+            "id": 1,
+            "field": "hello!",
+        },{
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        }
+        ]);
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_csv() {
+        let mut cursor = Cursor::new(Vec::new());
+
+        let csv = "id:number,field:string\n1,hello!\n2,blabla";
+
+        let builder =
+            DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 2);
+        assert_eq!(document.iter().count(), 2);
+
+        let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn simple_csv_document() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn coma_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United, States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United, States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn quote_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United"" States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United\" States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn integer_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.0,
+            })
+        );
+    }
+
+    #[test]
+    fn float_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910.01""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.01,
+            })
+        );
+    }
+
+    #[test]
+    fn several_colon_in_header() {
+        let documents = r#"city:love:string,country:state,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:love": "Boston",
+                "country:state": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn ending_by_colon_in_header() {
+        let documents = r#"city:,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn starting_by_colon_in_header() {
+        let documents = r#":city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                ":city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[ignore]
+    #[test]
+    fn starting_by_colon_in_header2() {
+        let documents = r#":string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+
+        assert!(reader.next_document_with_index().is_err());
+    }
+
+    #[test]
+    fn double_colon_in_header() {
+        let documents = r#"city::string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn bad_type_in_header() {
+        let documents = r#"city,country:number,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count1() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910", "too much""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count2() {
+        let documents = r#"city,country,pop
+"Boston","United States""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
    }
 }
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -7,7 +7,8 @@ mod builder;
 mod reader;
 mod serde;

-use std::{fmt, io};
+use std::fmt::{self, Debug};
+use std::io;

 use ::serde::{Deserialize, Serialize};
 use bimap::BiHashMap;
@ -17,7 +18,38 @@ pub use reader::DocumentBatchReader;
 use crate::FieldId;

 /// A bidirectional map that links field ids to their name in a document batch.
-pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
+#[derive(Default, Debug, Serialize, Deserialize)]
+pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
+
+impl DocumentsBatchIndex {
+    /// Insert the field in the map, or return it's field id if it doesn't already exists.
+    pub fn insert(&mut self, field: &str) -> FieldId {
+        match self.0.get_by_right(field) {
+            Some(field_id) => *field_id,
+            None => {
+                let field_id = self.0.len() as FieldId;
+                self.0.insert(field_id, field.to_string());
+                field_id
+            }
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
+        self.0.iter()
+    }
+
+    pub fn name(&self, id: FieldId) -> Option<&String> {
+        self.0.get_by_left(&id)
+    }
+}

 #[derive(Debug, Serialize, Deserialize)]
 struct DocumentsMetadata {
@ -50,14 +82,22 @@ impl<W: io::Write> io::Write for ByteCounter<W> {

 #[derive(Debug)]
 pub enum Error {
+    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
    InvalidDocumentFormat,
    Custom(String),
    JsonError(serde_json::Error),
+    CsvError(csv::Error),
    Serialize(bincode::Error),
    Io(io::Error),
    DocumentTooLarge,
 }

+impl From<csv::Error> for Error {
+    fn from(e: csv::Error) -> Self {
+        Self::CsvError(e)
+    }
+}
+
 impl From<io::Error> for Error {
    fn from(other: io::Error) -> Self {
        Self::Io(other)
@ -70,15 +110,25 @@ impl From<bincode::Error> for Error {
    }
 }

+impl From<serde_json::Error> for Error {
+    fn from(other: serde_json::Error) -> Self {
+        Self::JsonError(other)
+    }
+}
+
 impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
+            Error::ParseFloat { error, line, value } => {
+                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
+            }
            Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
            Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
            Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
-            Error::Io(e) => e.fmt(f),
+            Error::Io(e) => write!(f, "{}", e),
            Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
-            Error::Serialize(e) => e.fmt(f),
+            Error::Serialize(e) => write!(f, "{}", e),
+            Error::CsvError(e) => write!(f, "{}", e),
        }
    }
 }
@ -92,7 +142,8 @@ macro_rules! documents {
        let documents = serde_json::json!($data);
        let mut writer = std::io::Cursor::new(Vec::new());
        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-        builder.add_documents(documents).unwrap();
+        let documents = serde_json::to_vec(&documents).unwrap();
+        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
        builder.finish().unwrap();

        writer.set_position(0);
@ -103,6 +154,8 @@ macro_rules! documents {

 #[cfg(test)]
 mod test {
+    use std::io::Cursor;
+
    use serde_json::{json, Value};

    use super::*;
@ -119,12 +172,14 @@ mod test {
            "bool": true
        });

+        let json = serde_json::to_vec(&json).unwrap();
+
        let mut v = Vec::new();
        let mut cursor = io::Cursor::new(&mut v);

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();

-        builder.add_documents(json).unwrap();
+        builder.extend_from_json(Cursor::new(json)).unwrap();

        builder.finish().unwrap();

@ -148,13 +203,16 @@ mod test {
            "toto": false,
        });

+        let doc1 = serde_json::to_vec(&doc1).unwrap();
+        let doc2 = serde_json::to_vec(&doc2).unwrap();
+
        let mut v = Vec::new();
        let mut cursor = io::Cursor::new(&mut v);

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();

-        builder.add_documents(doc1).unwrap();
-        builder.add_documents(doc2).unwrap();
+        builder.extend_from_json(Cursor::new(doc1)).unwrap();
+        builder.extend_from_json(Cursor::new(doc2)).unwrap();

        builder.finish().unwrap();

@ -177,12 +235,14 @@ mod test {
            { "tata": "hello" },
        ]);

+        let docs = serde_json::to_vec(&docs).unwrap();
+
        let mut v = Vec::new();
        let mut cursor = io::Cursor::new(&mut v);

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();

-        builder.add_documents(docs).unwrap();
+        builder.extend_from_json(Cursor::new(docs)).unwrap();

        builder.finish().unwrap();

@ -210,11 +270,13 @@ mod test {
            { "tata": "hello" },
        ]]);

-        assert!(builder.add_documents(docs).is_err());
+        let docs = serde_json::to_vec(&docs).unwrap();
+        assert!(builder.extend_from_json(Cursor::new(docs)).is_err());

        let docs = json!("hello");
+        let docs = serde_json::to_vec(&docs).unwrap();

-        assert!(builder.add_documents(docs).is_err());
+        assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
    }

    #[test]
--- a/milli/src/documents/serde.rs
+++ b/milli/src/documents/serde.rs
@ -1,474 +1,134 @@
 use std::collections::BTreeMap;
-use std::convert::TryInto;
-use std::io::Cursor;
-use std::{fmt, io};
+use std::fmt;
+use std::io::{Cursor, Write};

-use byteorder::{BigEndian, WriteBytesExt};
-use obkv::KvWriter;
-use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
+use byteorder::WriteBytesExt;
+use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
+use serde::Deserialize;
 use serde_json::Value;

 use super::{ByteCounter, DocumentsBatchIndex, Error};
 use crate::FieldId;

-pub struct DocumentSerializer<W> {
-    pub writer: ByteCounter<W>,
-    pub buffer: Vec<u8>,
-    pub index: DocumentsBatchIndex,
-    pub count: usize,
-    pub allow_seq: bool,
-}
-
-impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
-    type Ok = ();
-
-    type Error = Error;
-
-    type SerializeSeq = SeqSerializer<'a, W>;
-    type SerializeTuple = Impossible<(), Self::Error>;
-    type SerializeTupleStruct = Impossible<(), Self::Error>;
-    type SerializeTupleVariant = Impossible<(), Self::Error>;
-    type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
-    type SerializeStruct = Impossible<(), Self::Error>;
-    type SerializeStructVariant = Impossible<(), Self::Error>;
-    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        self.buffer.clear();
-        let cursor = io::Cursor::new(&mut self.buffer);
-        self.count += 1;
-        let map_serializer = MapSerializer {
-            map: KvWriter::new(cursor),
-            index: &mut self.index,
-            writer: &mut self.writer,
-            mapped_documents: BTreeMap::new(),
-        };
-
-        Ok(map_serializer)
-    }
-
-    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        if self.allow_seq {
-            // Only allow sequence of documents of depth 1.
-            self.allow_seq = false;
-            Ok(SeqSerializer { serializer: self })
-        } else {
-            Err(Error::InvalidDocumentFormat)
+macro_rules! tri {
+    ($e:expr) => {
+        match $e {
+            Ok(r) => r,
+            Err(e) => return Ok(Err(e.into())),
        }
-    }
+    };
+}

-    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
+struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);

-    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
+impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;

-    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
-        T: Serialize,
+        D: serde::Deserializer<'de>,
    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-    ) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_struct<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_variant<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStructVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
+        deserializer.deserialize_str(self)
    }
 }

-pub struct SeqSerializer<'a, W> {
-    serializer: &'a mut DocumentSerializer<W>,
-}
+impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;

-impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
-    type Ok = ();
-    type Error = Error;
-
-    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
    where
-        T: Serialize,
+        E: serde::de::Error,
    {
-        value.serialize(&mut *self.serializer)?;
-        Ok(())
+        Ok(self.0.insert(v))
    }

-    fn end(self) -> Result<Self::Ok, Self::Error> {
-        Ok(())
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a string")
    }
 }

-pub struct MapSerializer<'a, W> {
-    map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
-    index: &'a mut DocumentsBatchIndex,
-    writer: W,
-    mapped_documents: BTreeMap<FieldId, Value>,
+struct ValueDeserializer;
+
+impl<'de> DeserializeSeed<'de> for ValueDeserializer {
+    type Value = serde_json::Value;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        serde_json::Value::deserialize(deserializer)
+    }
 }

-/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and
-/// serialize_value, therefore these to methods remain unimplemented.
-impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
-    type Ok = ();
-    type Error = Error;
+pub struct DocumentVisitor<'a, W> {
+    pub inner: &'a mut ByteCounter<W>,
+    pub index: &'a mut DocumentsBatchIndex,
+    pub obkv_buffer: &'a mut Vec<u8>,
+    pub value_buffer: &'a mut Vec<u8>,
+    pub values: &'a mut BTreeMap<FieldId, Value>,
+    pub count: &'a mut usize,
+}

-    fn serialize_key<T: ?Sized + Serialize>(&mut self, _key: &T) -> Result<(), Self::Error> {
-        unreachable!()
-    }
+impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
+    /// This Visitor value is nothing, since it write the value to a file.
+    type Value = Result<(), Error>;

-    fn serialize_value<T: ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
-        unreachable!()
-    }
-
-    fn end(mut self) -> Result<Self::Ok, Self::Error> {
-        let mut buf = Vec::new();
-        for (key, value) in self.mapped_documents {
-            buf.clear();
-            let mut cursor = Cursor::new(&mut buf);
-            serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?;
-            self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?;
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: SeqAccess<'de>,
+    {
+        while let Some(v) = seq.next_element_seed(&mut *self)? {
+            tri!(v)
        }

-        let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
-        let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
-
-        self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
-        self.writer.write_all(&data).map_err(Error::Io)?;
-
-        Ok(())
+        Ok(Ok(()))
    }

-    fn serialize_entry<K: ?Sized, V: ?Sized>(
-        &mut self,
-        key: &K,
-        value: &V,
-    ) -> Result<(), Self::Error>
+    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
    where
-        K: Serialize,
-        V: Serialize,
+        A: MapAccess<'de>,
    {
-        let field_serializer = FieldSerializer { index: &mut self.index };
-        let field_id: FieldId = key.serialize(field_serializer)?;
+        while let Some((key, value)) =
+            map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
+        {
+            self.values.insert(key, value);
+        }

-        let value = serde_json::to_value(value).map_err(Error::JsonError)?;
+        self.obkv_buffer.clear();
+        let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
+        for (key, value) in self.values.iter() {
+            self.value_buffer.clear();
+            // This is guaranteed to work
+            tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
+            tri!(obkv.insert(*key, &self.value_buffer));
+        }

-        self.mapped_documents.insert(field_id, value);
+        let reader = tri!(obkv.into_inner()).into_inner();

-        Ok(())
+        tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
+        tri!(self.inner.write_all(reader));
+
+        *self.count += 1;
+        self.values.clear();
+
+        Ok(Ok(()))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a documents, or a sequence of documents.")
    }
 }

-struct FieldSerializer<'a> {
-    index: &'a mut DocumentsBatchIndex,
-}
+impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
+where
+    W: Write,
+{
+    type Value = Result<(), Error>;

-impl<'a> serde::Serializer for FieldSerializer<'a> {
-    type Ok = FieldId;
-
-    type Error = Error;
-
-    type SerializeSeq = Impossible<FieldId, Self::Error>;
-    type SerializeTuple = Impossible<FieldId, Self::Error>;
-    type SerializeTupleStruct = Impossible<FieldId, Self::Error>;
-    type SerializeTupleVariant = Impossible<FieldId, Self::Error>;
-    type SerializeMap = Impossible<FieldId, Self::Error>;
-    type SerializeStruct = Impossible<FieldId, Self::Error>;
-    type SerializeStructVariant = Impossible<FieldId, Self::Error>;
-
-    fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
-        let field_id = match self.index.get_by_right(ws) {
-            Some(field_id) => *field_id,
-            None => {
-                let field_id = self.index.len() as FieldId;
-                self.index.insert(field_id, ws.to_string());
-                field_id
-            }
-        };
-
-        Ok(field_id)
-    }
-
-    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
-        T: Serialize,
+        D: serde::Deserializer<'de>,
    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-    ) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_struct<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_variant<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStructVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-}
-
-impl serde::ser::Error for Error {
-    fn custom<T: fmt::Display>(msg: T) -> Self {
-        Error::Custom(msg.to_string())
+        deserializer.deserialize_map(self)
    }
 }
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::BTreeSet;
 use std::convert::Infallible;
 use std::error::Error as StdError;
 use std::{fmt, io, str};
@ -57,19 +57,17 @@ pub enum UserError {
    CriterionError(CriterionError),
    DocumentLimitReached,
    InvalidDocumentId { document_id: Value },
-    InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
+    InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
    InvalidGeoField { document_id: Value, object: Value },
    InvalidFilter(String),
-    InvalidSortName { name: String },
-    InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
+    InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
    SortRankingRuleMissing,
    InvalidStoreFile,
    MaxDatabaseSizeReached,
-    MissingDocumentId { document: Object },
+    MissingDocumentId { primary_key: String, document: Object },
    MissingPrimaryKey,
    NoSpaceLeftOnDevice,
-    PrimaryKeyCannotBeChanged,
-    PrimaryKeyCannotBeReset,
+    PrimaryKeyCannotBeChanged(String),
    SerdeJson(serde_json::Error),
    SortError(SortError),
    UnknownInternalDocumentId { document_id: DocumentId },
@ -168,7 +166,7 @@ impl From<SerializationError> for Error {
 impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
-            Self::InternalError(error) => write!(f, "internal: {}", error),
+            Self::InternalError(error) => write!(f, "internal: {}.", error),
            Self::IoError(error) => error.fmt(f),
            Self::UserError(error) => error.fmt(f),
        }
@ -181,15 +179,15 @@ impl fmt::Display for InternalError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::DatabaseMissingEntry { db_name, key } => {
-                write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
+                write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name)
            }
            Self::FieldIdMapMissingEntry(error) => error.fmt(f),
            Self::Fst(error) => error.fmt(f),
            Self::GrenadInvalidCompressionType => {
-                f.write_str("invalid compression type have been specified to grenad")
+                f.write_str("Invalid compression type have been specified to grenad.")
            }
            Self::IndexingMergingKeys { process } => {
-                write!(f, "invalid merge while processing {}", process)
+                write!(f, "Invalid merge while processing {}.", process)
            }
            Self::Serialization(error) => error.fmt(f),
            Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
@ -207,69 +205,75 @@ impl StdError for InternalError {}
 impl fmt::Display for UserError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
-            Self::InvalidFilter(input) => write!(f, "{}", input),
-            Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"),
+            Self::InvalidFilter(error) => f.write_str(error),
+            Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."),
            Self::CriterionError(error) => write!(f, "{}", error),
-            Self::DocumentLimitReached => f.write_str("maximum number of documents reached"),
+            Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."),
            Self::InvalidFacetsDistribution { invalid_facets_name } => {
                let name_list =
                    invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
                write!(
                    f,
-                    "invalid facet distribution, the fields {} are not set as filterable",
+                    "Invalid facet distribution, the fields `{}` are not set as filterable.",
                    name_list
                )
            }
-            Self::InvalidGeoField { document_id, object } => write!(
-                f,
-                "the document with the id: {} contains an invalid _geo field: {}",
-                document_id, object
-            ),
-            Self::InvalidDocumentId { document_id } => {
-                let json = serde_json::to_string(document_id).unwrap();
+            Self::InvalidGeoField { document_id, object } => {
+                let document_id = match document_id {
+                    Value::String(id) => id.clone(),
+                    _ => document_id.to_string(),
+                };
+                let object = match object {
+                    Value::String(id) => id.clone(),
+                    _ => object.to_string(),
+                };
                write!(
                    f,
-                    "document identifier is invalid {}, \
-a document id can be of type integer or string \
-only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)",
-                    json
+                    "The document with the id: `{}` contains an invalid _geo field: `{}`.",
+                    document_id, object
+                )
+            },
+            Self::InvalidDocumentId { document_id } => {
+                let document_id = match document_id {
+                    Value::String(id) => id.clone(),
+                    _ => document_id.to_string(),
+                };
+                write!(
+                    f,
+                    "Document identifier `{}` is invalid. \
+A document identifier can be of type integer or string, \
+only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).",
+                    document_id
                )
-            }
-            Self::InvalidSortName { name } => {
-                write!(f, "Invalid syntax for the sort parameter: {}", name)
            }
            Self::InvalidSortableAttribute { field, valid_fields } => {
                let valid_names =
                    valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
                write!(
                    f,
-                    "Attribute {} is not sortable, available sortable attributes are: {}",
+                    "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.",
                    field, valid_names
                )
            }
            Self::SortRankingRuleMissing => f.write_str(
-                "You must specify where \"sort\" is listed in the \
-rankingRules setting to use the sort parameter at search time",
+                "The sort ranking rule must be specified in the \
+ranking rules settings to use the sort parameter at search time.",
            ),
-            Self::MissingDocumentId { document } => {
+            Self::MissingDocumentId { primary_key, document } => {
                let json = serde_json::to_string(document).unwrap();
-                write!(f, "document doesn't have an identifier {}", json)
+                write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
            }
-            Self::MissingPrimaryKey => f.write_str("missing primary key"),
-            Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
-            // TODO where can we find it instead of writing the text ourselves?
-            Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"),
-            Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
-            Self::PrimaryKeyCannotBeChanged => {
-                f.write_str("primary key cannot be changed if the database contains documents")
-            }
-            Self::PrimaryKeyCannotBeReset => {
-                f.write_str("primary key cannot be reset if the database contains documents")
+            Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."),
+            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
+            Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
+            Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
+            Self::PrimaryKeyCannotBeChanged(primary_key) => {
+                write!(f, "Index already has a primary key: `{}`.", primary_key)
            }
            Self::SerdeJson(error) => error.fmt(f),
            Self::SortError(error) => write!(f, "{}", error),
            Self::UnknownInternalDocumentId { document_id } => {
-                write!(f, "an unknown internal document id have been used ({})", document_id)
+                write!(f, "An unknown internal document id have been used: `{}`.", document_id)
            }
        }
    }
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@ -953,6 +953,7 @@ pub(crate) mod tests {
            { "id": 1, "name": "kevin", "has_dog": true },
            { "id": 2, "name": "bob" }
        ]);
+
        let mut wtxn = index.write_txn().unwrap();
        let builder = IndexDocuments::new(&mut wtxn, &index, 0);
        builder.execute(content, |_, _| ()).unwrap();
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@ -68,7 +68,9 @@ mod test {
                "txts": sample_txts[..(rng.gen_range(0..3))],
                "cat-ints": sample_ints[..(rng.gen_range(0..3))],
            });
-            builder.add_documents(doc).unwrap();
+
+            let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
+            builder.extend_from_json(doc).unwrap();
        }

        builder.finish().unwrap();
--- a/milli/src/search/facet/grammar.pest
+++ b/milli/src/search/facet/grammar.pest
@ -0,0 +1,33 @@
+key = _{reserved | quoted | word }
+value = _{quoted | word }
+quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP  }
+string = {char*}
+word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
+
+char =  _{ !(PEEK | "\\") ~ ANY
+    | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
+    | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
+
+reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" }
+// we deliberately choose to allow empty parameters to generate more specific error message later
+parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
+condition = _{between | eq | greater | less | geq | leq | neq}
+between = {key ~ value ~ "TO" ~ value}
+geq = {key ~ ">=" ~ value}
+leq = {key ~ "<=" ~ value}
+neq = {key ~ "!=" ~ value}
+eq = {key ~ "=" ~ value}
+greater = {key ~ ">" ~ value}
+less = {key ~ "<" ~ value}
+geo_radius = {"_geoRadius" ~ parameters }
+
+prgm = {SOI ~ expr ~ EOI}
+expr = _{ ( term ~ (operation ~ term)* ) }
+term = { ("(" ~ expr ~ ")") | condition | not | geo_radius }
+operation = _{ and | or }
+and = {"AND"}
+or = {"OR"}
+
+not = {"NOT" ~ term}
+
+WHITESPACE = _{ " " }
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@ -151,13 +151,13 @@ impl<'a> Search<'a> {
                    Member::Field(ref field) if !sortable_fields.contains(field) => {
                        return Err(UserError::InvalidSortableAttribute {
                            field: field.to_string(),
-                            valid_fields: sortable_fields,
+                            valid_fields: sortable_fields.into_iter().collect(),
                        })?
                    }
                    Member::Geo(_) if !sortable_fields.contains("_geo") => {
                        return Err(UserError::InvalidSortableAttribute {
                            field: "_geo".to_string(),
-                            valid_fields: sortable_fields,
+                            valid_fields: sortable_fields.into_iter().collect(),
                        })?
                    }
                    _ => (),
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@ -877,7 +877,8 @@ mod tests {
        let mut cursor = Cursor::new(Vec::new());

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        builder.add_documents(big_object).unwrap();
+        let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
+        builder.extend_from_json(big_object).unwrap();
        builder.finish().unwrap();
        cursor.set_position(0);
        let content = DocumentBatchReader::from_reader(cursor).unwrap();
@ -905,8 +906,9 @@ mod tests {

        let mut cursor = Cursor::new(Vec::new());

+        let big_object = serde_json::to_string(&big_object).unwrap();
        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        builder.add_documents(big_object).unwrap();
+        builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
        builder.finish().unwrap();
        cursor.set_position(0);
        let content = DocumentBatchReader::from_reader(cursor).unwrap();
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@ -75,7 +75,7 @@ fn create_fields_mapping(
        .collect()
 }

-fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
+fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
    index
        .iter()
        .sorted_by_key(|(k, _)| *k)
@ -179,7 +179,7 @@ impl Transform<'_, '_> {
                        if !self.autogenerate_docids {
                            let mut json = Map::new();
                            for (key, value) in document.iter() {
-                                let key = addition_index.get_by_left(&key).cloned();
+                                let key = addition_index.name(key).cloned();
                                let value = serde_json::from_slice::<Value>(&value).ok();

                                if let Some((k, v)) = key.zip(value) {
@ -187,7 +187,11 @@ impl Transform<'_, '_> {
                                }
                            }

-                            return Err(UserError::MissingDocumentId { document: json }.into());
+                            return Err(UserError::MissingDocumentId {
+                                primary_key: primary_key_name,
+                                document: json,
+                            }
+                            .into());
                        }

                        let uuid =
@ -544,6 +548,7 @@ mod test {
    mod primary_key_inference {
        use bimap::BiHashMap;

+        use crate::documents::DocumentsBatchIndex;
        use crate::update::index_documents::transform::find_primary_key;

        #[test]
@ -557,7 +562,7 @@ mod test {
                map.insert(4, "fakeId".to_string());
                map.insert(0, "realId".to_string());

-                assert_eq!(find_primary_key(&map), Some("realId"));
+                assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
            }
        }
    }
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    self.index.put_primary_key(self.wtxn, primary_key)?;
                    Ok(())
                } else {
-                    Err(UserError::PrimaryKeyCannotBeChanged.into())
+                    let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                    Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                }
            }
            Setting::Reset => {
@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    self.index.delete_primary_key(self.wtxn)?;
                    Ok(())
                } else {
-                    Err(UserError::PrimaryKeyCannotBeReset.into())
+                    let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                    Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                }
            }
            Setting::NotSet => Ok(()),
@ -1106,7 +1108,7 @@ mod tests {
        builder.reset_primary_key();

        let err = builder.execute(|_, _| ()).unwrap_err();
-        assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset)));
+        assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
        wtxn.abort().unwrap();

        // But if we clear the database...
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@ -61,9 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let mut cursor = Cursor::new(Vec::new());
    let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
    let reader = Cursor::new(CONTENT.as_bytes());
+
    for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
-        documents_builder.add_documents(doc.unwrap()).unwrap();
+        let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
+        documents_builder.extend_from_json(doc).unwrap();
    }
+
    documents_builder.finish().unwrap();

    cursor.set_position(0);
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@ -409,7 +409,8 @@ fn criteria_ascdesc() {
            "age": age,
        });

-        batch_builder.add_documents(json).unwrap();
+        let json = Cursor::new(serde_json::to_vec(&json).unwrap());
+        batch_builder.extend_from_json(json).unwrap();
    });

    batch_builder.finish().unwrap();
--- a/search/Cargo.toml
+++ b/search/Cargo.toml
@ -1,18 +0,0 @@
-[package]
-name = "search"
-version = "0.18.0"
-authors = ["Clément Renault <clement@meilisearch.com>"]
-edition = "2018"
-
-[dependencies]
-anyhow = "1.0.38"
-byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
-heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-log = "0.4.14"
-milli = { path = "../milli" }
-serde_json = "1.0.62"
-stderrlog = "0.5.1"
-structopt = { version = "0.3.21", default-features = false }
-
-[target.'cfg(target_os = "linux")'.dependencies]
-jemallocator = "0.3.2"