Mirror of https://github.com/meilisearch/MeiliSearch (synced 2025-01-11 05:54:30 +01:00)

Commit 6831c23449: merge with main
@@ -1,7 +1,7 @@
 #![allow(dead_code)]
 
 use std::fs::{create_dir_all, remove_dir_all, File};
-use std::io::{self, Cursor, Read, Seek};
+use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
 use std::num::ParseFloatError;
 use std::path::Path;
 
@@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl
     DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap()
 }
 
-fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
     Ok(writer.into_inner())
 }
 
-fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
 }
 
-fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
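For context, a minimal sketch (not part of the commit) of how the streaming API above is meant to be driven. It uses only the calls visible in this diff (`DocumentBatchBuilder::new`, `extend_from_json`, `finish`); unlike the loop in the hunk, the sketch clears the line buffer between iterations so each JSON document is parsed exactly once.

use std::io::{BufRead, BufReader, Cursor};

fn jsonl_to_batch(input: &[u8]) -> Vec<u8> {
    let mut writer = Cursor::new(Vec::new());
    let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap();

    let mut reader = BufReader::new(input);
    let mut buf = String::new();
    // read one JSON document per line and feed it to the builder
    while reader.read_line(&mut buf).unwrap() > 0 {
        builder.extend_from_json(&mut buf.as_bytes()).unwrap();
        buf.clear();
    }
    builder.finish().unwrap();
    writer.into_inner()
}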
@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.1.0"
+version = "0.20.2"
 edition = "2018"
 description = "A CLI to interact with a milli index"
 
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{stdin, Cursor, Read};
+use std::io::{stdin, BufRead, BufReader, Cursor, Read};
 use std::path::PathBuf;
 use std::str::FromStr;
 
@@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
 use milli::update::UpdateIndexingStep::{
     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
 };
-use serde_json::{Map, Value};
 use structopt::StructOpt;
 
 #[cfg(target_os = "linux")]
@@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
@@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
@@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
 
 fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let mut records = csv::Reader::from_reader(reader);
-    let iter = records.deserialize::<Map<String, Value>>();
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
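The CSV path above collapses to a single expression because `from_csv` both consumes the whole reader and returns the already-populated builder. A hedged usage sketch (the helper name is illustrative; the API is the one shown in this diff):

use std::io::Cursor;

fn csv_to_batch(csv: &str) -> Vec<u8> {
    let mut writer = Cursor::new(Vec::new());
    milli::documents::DocumentBatchBuilder::from_csv(csv.as_bytes(), &mut writer)
        .unwrap()
        .finish() // writes the metadata footer and the offset header
        .unwrap();
    writer.into_inner()
}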
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
@@ -1,285 +0,0 @@
-use std::io::{Read, Result as IoResult};
-use std::num::ParseFloatError;
-
-use serde_json::{Map, Value};
-
-enum AllowedType {
-    String,
-    Number,
-}
-
-fn parse_csv_header(header: &str) -> (String, AllowedType) {
-    // if there are several separators we only split on the last one.
-    match header.rsplit_once(':') {
-        Some((field_name, field_type)) => match field_type {
-            "string" => (field_name.to_string(), AllowedType::String),
-            "number" => (field_name.to_string(), AllowedType::Number),
-            // we may return an error in this case.
-            _otherwise => (header.to_string(), AllowedType::String),
-        },
-        None => (header.to_string(), AllowedType::String),
-    }
-}
-
-pub struct CSVDocumentDeserializer<R>
-where
-    R: Read,
-{
-    documents: csv::StringRecordsIntoIter<R>,
-    headers: Vec<(String, AllowedType)>,
-}
-
-impl<R: Read> CSVDocumentDeserializer<R> {
-    pub fn from_reader(reader: R) -> IoResult<Self> {
-        let mut records = csv::Reader::from_reader(reader);
-
-        let headers = records.headers()?.into_iter().map(parse_csv_header).collect();
-
-        Ok(Self { documents: records.into_records(), headers })
-    }
-}
-
-impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
-    type Item = anyhow::Result<Map<String, Value>>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let csv_document = self.documents.next()?;
-
-        match csv_document {
-            Ok(csv_document) => {
-                let mut document = Map::new();
-
-                for ((field_name, field_type), value) in
-                    self.headers.iter().zip(csv_document.into_iter())
-                {
-                    let parsed_value: Result<Value, ParseFloatError> = match field_type {
-                        AllowedType::Number => {
-                            value.parse::<f64>().map(Value::from).map_err(Into::into)
-                        }
-                        AllowedType::String => Ok(Value::String(value.to_string())),
-                    };
-
-                    match parsed_value {
-                        Ok(value) => drop(document.insert(field_name.to_string(), value)),
-                        Err(_e) => {
-                            return Some(Err(anyhow::anyhow!(
-                                "Value '{}' is not a valid number",
-                                value
-                            )))
-                        }
-                    }
-                }
-
-                Some(Ok(document))
-            }
-            Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use serde_json::json;
-
-    use super::*;
-
-    #[test]
-    fn simple_csv_document() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn coma_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United, States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United, States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn quote_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United"" States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United\" States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn integer_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.0,
-            })
-        );
-    }
-
-    #[test]
-    fn float_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910.01""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.01,
-            })
-        );
-    }
-
-    #[test]
-    fn several_double_dot_in_header() {
-        let documents = r#"city:love:string,country:state,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:love": "Boston",
-                "country:state": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn ending_by_double_dot_in_header() {
-        let documents = r#"city:,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header() {
-        let documents = r#":city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                ":city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header2() {
-        let documents = r#":string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn double_double_dot_in_header() {
-        let documents = r#"city::string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count1() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910", "too much""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count2() {
-        let documents = r#"city,country,pop
-"Boston","United States""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-}
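The `field:type` header convention from the deleted deserializer survives in the new `DocumentBatchBuilder::from_csv` (see the milli changes below). A small self-contained sketch of just that convention, with a hypothetical `classify` helper standing in for `parse_csv_header`: splitting happens on the last colon only, and an unrecognized suffix leaves the header untouched.

fn classify(header: &str) -> (String, &'static str) {
    match header.rsplit_once(':') {
        Some((name, "string")) => (name.to_string(), "string"),
        Some((name, "number")) => (name.to_string(), "number"),
        // unknown or missing type suffix: keep the whole header as a string field
        _ => (header.to_string(), "string"),
    }
}

fn main() {
    assert_eq!(classify("pop:number"), ("pop".to_string(), "number"));
    assert_eq!(classify("city:love:string"), ("city:love".to_string(), "string"));
    assert_eq!(classify("country:state"), ("country:state".to_string(), "string"));
}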
@@ -1,10 +1,9 @@
-mod documents_from_csv;
 mod update_store;
 
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{create_dir_all, File};
-use std::io::Cursor;
+use std::io::{BufRead, BufReader, Cursor};
 use std::net::SocketAddr;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::PathBuf;
@@ -40,7 +39,6 @@ use warp::http::Response;
 use warp::Filter;
 
 use self::update_store::UpdateStore;
-use crate::documents_from_csv::CSVDocumentDeserializer;
 
 #[cfg(target_os = "linux")]
 #[global_allocator]
@@ -1048,11 +1046,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
@@ -1063,8 +1061,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
@@ -1072,16 +1069,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
 
 fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Kerollmops <clement@meilisearch.com>"]
 edition = "2018"
 
@@ -46,6 +46,7 @@ itertools = "0.10.0"
 # logging
 log = "0.4.14"
 logging_timer = "1.0.0"
+csv = "1.1.6"
 
 [dev-dependencies]
 big_s = "1.0.2"
@@ -28,12 +28,12 @@ impl fmt::Display for AscDescError {
                 write!(f, "Longitude must be contained between -180 and 180 degrees.",)
             }
             Self::InvalidSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for {}.", name)
+                write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedKeyword { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a asc/desc rule.",
+                    "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
                     name
                 )
             }
@@ -192,18 +192,18 @@ impl fmt::Display for SortError {
             Self::BadGeoPointUsage { name } => {
                 write!(
                     f,
-                    "invalid syntax for the `_geoPoint` parameter: `{}`. \
-                    Usage: `_geoPoint(latitude, longitude):asc`.",
+                    "Invalid syntax for the geo parameter: expected expression formated like \
+                    `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
                     name
                 )
             }
             Self::InvalidName { name } => {
-                write!(f, "invalid syntax for the sort parameter `{}`.", name)
+                write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedName { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a sort expression.",
+                    "`{}` is a reserved keyword and thus can't be used as a sort expression.",
                     name
                 )
             }
@@ -211,7 +211,7 @@ impl fmt::Display for SortError {
                 write!(
                     f,
                     "`{}` is a reserved keyword and thus can't be used as a sort expression. \
-                    Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.",
+                    Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.",
                     name,
                 )
             }
@@ -318,50 +318,4 @@ mod tests {
             );
         }
     }
-
-    #[test]
-    fn sort_error_message() {
-        let errors = [
-            (
-                AscDescError::InvalidSyntax { name: S("truc:machin") },
-                S("invalid syntax for the sort parameter `truc:machin`."),
-            ),
-            (
-                AscDescError::InvalidSyntax { name: S("hello:world") },
-                S("invalid syntax for the sort parameter `hello:world`."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geo") },
-                S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoDistance") },
-                S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.")
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") },
-                S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::InvalidLatitude,
-                S("Latitude must be contained between -90 and 90 degrees."),
-            ),
-            (
-                AscDescError::InvalidLongitude,
-                S("Longitude must be contained between -180 and 180 degrees."),
-            ),
-        ];
-
-        for (asc_desc_error, expected_message) in errors {
-            let sort_error = SortError::from(asc_desc_error);
-            assert_eq!(
-                sort_error.to_string(),
-                expected_message,
-                "was expecting {} for the error {:?} but instead got {}",
-                expected_message,
-                sort_error,
-                sort_error.to_string()
-            );
-        }
-    }
 }
@@ -17,23 +17,27 @@ pub enum CriterionError {
 impl fmt::Display for CriterionError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name),
             Self::ReservedName { name } => {
-                write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
+                write!(
+                    f,
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule",
+                    name
+                )
             }
             Self::ReservedNameForSort { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-                    {} can only be used for sorting at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+                    `{}` can only be used for sorting at search time",
                     name, name
                 )
             }
             Self::ReservedNameForFilter { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-                    {} can only be used for filtering at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+                    `{}` can only be used for filtering at search time",
                     name, name
                 )
             }
@@ -1,16 +1,20 @@
+use std::collections::BTreeMap;
 use std::io;
+use std::io::{Cursor, Write};
 
 use byteorder::{BigEndian, WriteBytesExt};
-use serde::ser::Serialize;
+use serde::Deserializer;
+use serde_json::Value;
 
-use super::serde::DocumentSerializer;
+use super::serde::DocumentVisitor;
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
+use crate::FieldId;
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
 /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate other the documents.
+/// iterate over the documents.
 ///
 /// ## example:
 /// ```
@@ -18,43 +22,48 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 /// use serde_json::json;
 /// use std::io::Cursor;
 ///
+/// let json = r##"{"id": 1, "name": "foo"}"##;
 /// let mut writer = Cursor::new(Vec::new());
 /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
+/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
 /// builder.finish().unwrap();
 /// ```
 pub struct DocumentBatchBuilder<W> {
-    serializer: DocumentSerializer<W>,
+    inner: ByteCounter<W>,
+    index: DocumentsBatchIndex,
+    obkv_buffer: Vec<u8>,
+    value_buffer: Vec<u8>,
+    values: BTreeMap<FieldId, Value>,
+    count: usize,
 }
 
 impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
     pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::new();
+        let index = DocumentsBatchIndex::default();
         let mut writer = ByteCounter::new(writer);
         // add space to write the offset of the metadata at the end of the writer
         writer.write_u64::<BigEndian>(0)?;
 
-        let serializer =
-            DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
-
-        Ok(Self { serializer })
+        Ok(Self {
+            inner: writer,
+            index,
+            obkv_buffer: Vec::new(),
+            value_buffer: Vec::new(),
+            values: BTreeMap::new(),
+            count: 0,
+        })
     }
 
     /// Returns the number of documents that have been written to the builder.
     pub fn len(&self) -> usize {
-        self.serializer.count
+        self.count
     }
 
     /// This method must be called after the document addition is terminated. It will put the
     /// metadata at the end of the file, and write the metadata offset at the beginning on the
     /// file.
-    pub fn finish(self) -> Result<(), Error> {
-        let DocumentSerializer {
-            writer: ByteCounter { mut writer, count: offset },
-            index,
-            count,
-            ..
-        } = self.serializer;
+    pub fn finish(self) -> Result<usize, Error> {
+        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
 
         let meta = DocumentsMetadata { count, index };
 
@@ -65,16 +74,481 @@ impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
 
         writer.flush()?;
 
-        Ok(())
+        Ok(count)
     }
 
-    /// Adds documents to the builder.
+    /// Extends the builder with json documents from a reader.
+    pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+        let mut de = serde_json::Deserializer::from_reader(reader);
+
+        let mut visitor = DocumentVisitor {
+            inner: &mut self.inner,
+            index: &mut self.index,
+            obkv_buffer: &mut self.obkv_buffer,
+            value_buffer: &mut self.value_buffer,
+            values: &mut self.values,
+            count: &mut self.count,
+        };
+
+        de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
+    }
+
+    /// Creates a builder from a reader of CSV documents.
     ///
-    /// The internal index is updated with the fields found
-    /// in the documents. Document must either be a map or a sequences of map, anything else will
-    /// fail.
-    pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
-        document.serialize(&mut self.serializer)?;
-        Ok(())
+    /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform
+    /// optimisations, and extending from another CSV is not allowed.
+    pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
+        let mut this = Self::new(writer)?;
+        // Ensure that this is the first and only addition made with this builder
+        debug_assert!(this.index.is_empty());
+
+        let mut records = csv::Reader::from_reader(reader);
+
+        let headers = records
+            .headers()?
+            .into_iter()
+            .map(parse_csv_header)
+            .map(|(k, t)| (this.index.insert(&k), t))
+            .collect::<BTreeMap<_, _>>();
+
+        for (i, record) in records.into_records().enumerate() {
+            let record = record?;
+            this.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
+            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
+                let value = match ty {
+                    AllowedType::Number => {
+                        value.parse::<f64>().map(Value::from).map_err(|error| {
+                            Error::ParseFloat {
+                                error,
+                                // +1 for the header offset.
+                                line: i + 1,
+                                value: value.to_string(),
+                            }
+                        })?
+                    }
+                    AllowedType::String => Value::String(value.to_string()),
+                };
+
+                this.value_buffer.clear();
+                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
+                writer.insert(*fid, &this.value_buffer)?;
+            }
+
+            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
+            this.inner.write_all(&this.obkv_buffer)?;
+
+            this.count += 1;
+        }
+
+        Ok(this)
+    }
+}
+
+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Number,
+}
+
+fn parse_csv_header(header: &str) -> (String, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name.to_string(), AllowedType::String),
+            "number" => (field_name.to_string(), AllowedType::Number),
+            // if the pattern isn't reconized, we keep the whole field.
+            _otherwise => (header.to_string(), AllowedType::String),
+        },
+        None => (header.to_string(), AllowedType::String),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::io::Cursor;
+
+    use serde_json::{json, Map};
+
+    use super::*;
+    use crate::documents::DocumentBatchReader;
+
+    fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
+        let mut map = Map::new();
+
+        for (fid, value) in obkv.iter() {
+            let field_name = index.name(fid).unwrap().clone();
+            let value: Value = serde_json::from_slice(value).unwrap();
+
+            map.insert(field_name, value);
+        }
+
+        Value::Object(map)
+    }
+
+    #[test]
+    fn add_single_documents_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!({
+            "id": 1,
+            "field": "hello!",
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        let json = serde_json::json!({
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_seq_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!([{
+            "id": 1,
+            "field": "hello!",
+        },{
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        }
+        ]);
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_csv() {
+        let mut cursor = Cursor::new(Vec::new());
+
+        let csv = "id:number,field:string\n1,hello!\n2,blabla";
+
+        let builder =
+            DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 2);
+        assert_eq!(document.iter().count(), 2);
+
+        let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn simple_csv_document() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn coma_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United, States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United, States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn quote_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United"" States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United\" States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn integer_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.0,
+            })
+        );
+    }
+
+    #[test]
+    fn float_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910.01""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.01,
+            })
+        );
+    }
+
+    #[test]
+    fn several_colon_in_header() {
+        let documents = r#"city:love:string,country:state,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:love": "Boston",
+                "country:state": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn ending_by_colon_in_header() {
+        let documents = r#"city:,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn starting_by_colon_in_header() {
+        let documents = r#":city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                ":city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[ignore]
+    #[test]
+    fn starting_by_colon_in_header2() {
+        let documents = r#":string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+
+        assert!(reader.next_document_with_index().is_err());
+    }
+
+    #[test]
+    fn double_colon_in_header() {
+        let documents = r#"city::string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn bad_type_in_header() {
+        let documents = r#"city,country:number,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count1() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910", "too much""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count2() {
+        let documents = r#"city,country,pop
+"Boston","United States""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
     }
 }
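The batch layout written by `from_csv` above is simple length-prefixed framing: each document is an obkv map serialized into `obkv_buffer` and emitted as a big-endian u32 length followed by the raw payload, with field values stored as serde_json bytes. A sketch of just that framing step, grounded in the two write calls shown in the hunk:

use std::io::Write;
use byteorder::{BigEndian, WriteBytesExt};

fn write_record(out: &mut impl Write, obkv_buffer: &[u8]) -> std::io::Result<()> {
    // big-endian u32 length prefix, then the raw obkv bytes
    out.write_u32::<BigEndian>(obkv_buffer.len() as u32)?;
    out.write_all(obkv_buffer)
}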
@ -7,7 +7,8 @@ mod builder;
|
|||||||
mod reader;
|
mod reader;
|
||||||
mod serde;
|
mod serde;
|
||||||
|
|
||||||
use std::{fmt, io};
|
use std::fmt::{self, Debug};
|
||||||
|
use std::io;
|
||||||
|
|
||||||
use ::serde::{Deserialize, Serialize};
|
use ::serde::{Deserialize, Serialize};
|
||||||
use bimap::BiHashMap;
|
use bimap::BiHashMap;
|
||||||
@ -17,7 +18,38 @@ pub use reader::DocumentBatchReader;
|
|||||||
use crate::FieldId;
|
use crate::FieldId;
|
||||||
|
|
||||||
/// A bidirectional map that links field ids to their name in a document batch.
|
/// A bidirectional map that links field ids to their name in a document batch.
|
||||||
pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
|
#[derive(Default, Debug, Serialize, Deserialize)]
|
||||||
|
pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
|
||||||
|
|
||||||
|
impl DocumentsBatchIndex {
|
||||||
|
/// Insert the field in the map, or return it's field id if it doesn't already exists.
|
||||||
|
pub fn insert(&mut self, field: &str) -> FieldId {
|
||||||
|
match self.0.get_by_right(field) {
|
||||||
|
Some(field_id) => *field_id,
|
||||||
|
None => {
|
||||||
|
let field_id = self.0.len() as FieldId;
|
||||||
|
self.0.insert(field_id, field.to_string());
|
||||||
|
field_id
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn is_empty(&self) -> bool {
|
||||||
|
self.0.is_empty()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn len(&self) -> usize {
|
||||||
|
self.0.len()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
|
||||||
|
self.0.iter()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn name(&self, id: FieldId) -> Option<&String> {
|
||||||
|
self.0.get_by_left(&id)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
struct DocumentsMetadata {
|
struct DocumentsMetadata {
|
||||||
@ -50,14 +82,22 @@ impl<W: io::Write> io::Write for ByteCounter<W> {
|
|||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub enum Error {
|
pub enum Error {
|
||||||
|
ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
|
||||||
InvalidDocumentFormat,
|
InvalidDocumentFormat,
|
||||||
Custom(String),
|
Custom(String),
|
||||||
JsonError(serde_json::Error),
|
JsonError(serde_json::Error),
|
||||||
|
CsvError(csv::Error),
|
||||||
Serialize(bincode::Error),
|
Serialize(bincode::Error),
|
||||||
Io(io::Error),
|
Io(io::Error),
|
||||||
DocumentTooLarge,
|
DocumentTooLarge,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<csv::Error> for Error {
|
||||||
|
fn from(e: csv::Error) -> Self {
|
||||||
|
Self::CsvError(e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl From<io::Error> for Error {
|
impl From<io::Error> for Error {
|
||||||
fn from(other: io::Error) -> Self {
|
fn from(other: io::Error) -> Self {
|
||||||
Self::Io(other)
|
Self::Io(other)
|
||||||
@ -70,15 +110,25 @@ impl From<bincode::Error> for Error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl From<serde_json::Error> for Error {
|
||||||
|
fn from(other: serde_json::Error) -> Self {
|
||||||
|
Self::JsonError(other)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl fmt::Display for Error {
|
impl fmt::Display for Error {
|
||||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||||
match self {
|
match self {
|
||||||
|
Error::ParseFloat { error, line, value } => {
|
||||||
|
write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
|
||||||
|
}
|
||||||
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
|
||||||
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
|
||||||
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
|
||||||
Error::Io(e) => e.fmt(f),
|
Error::Io(e) => write!(f, "{}", e),
|
||||||
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
|
||||||
Error::Serialize(e) => e.fmt(f),
|
Error::Serialize(e) => write!(f, "{}", e),
|
||||||
|
Error::CsvError(e) => write!(f, "{}", e),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -92,7 +142,8 @@ macro_rules! documents {
|
|||||||
let documents = serde_json::json!($data);
|
let documents = serde_json::json!($data);
|
||||||
let mut writer = std::io::Cursor::new(Vec::new());
|
let mut writer = std::io::Cursor::new(Vec::new());
|
||||||
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
|
||||||
builder.add_documents(documents).unwrap();
|
let documents = serde_json::to_vec(&documents).unwrap();
|
||||||
|
builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
writer.set_position(0);
|
writer.set_position(0);
|
||||||
@ -103,6 +154,8 @@ macro_rules! documents {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
|
use std::io::Cursor;
|
||||||
|
|
||||||
use serde_json::{json, Value};
|
use serde_json::{json, Value};
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
@ -119,12 +172,14 @@ mod test {
|
|||||||
"bool": true
|
"bool": true
|
||||||
});
|
});
|
||||||
|
|
||||||
|
let json = serde_json::to_vec(&json).unwrap();
|
||||||
|
|
||||||
let mut v = Vec::new();
|
let mut v = Vec::new();
|
||||||
let mut cursor = io::Cursor::new(&mut v);
|
let mut cursor = io::Cursor::new(&mut v);
|
||||||
|
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
|
|
||||||
builder.add_documents(json).unwrap();
|
builder.extend_from_json(Cursor::new(json)).unwrap();
|
||||||
|
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
|
|
||||||
@@ -148,13 +203,16 @@ mod test {
            "toto": false,
        });

+       let doc1 = serde_json::to_vec(&doc1).unwrap();
+       let doc2 = serde_json::to_vec(&doc2).unwrap();
+
        let mut v = Vec::new();
        let mut cursor = io::Cursor::new(&mut v);

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();

-       builder.add_documents(doc1).unwrap();
-       builder.add_documents(doc2).unwrap();
+       builder.extend_from_json(Cursor::new(doc1)).unwrap();
+       builder.extend_from_json(Cursor::new(doc2)).unwrap();

        builder.finish().unwrap();
@@ -177,12 +235,14 @@ mod test {
            { "tata": "hello" },
        ]);

+       let docs = serde_json::to_vec(&docs).unwrap();
+
        let mut v = Vec::new();
        let mut cursor = io::Cursor::new(&mut v);

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();

-       builder.add_documents(docs).unwrap();
+       builder.extend_from_json(Cursor::new(docs)).unwrap();

        builder.finish().unwrap();
@@ -210,11 +270,13 @@ mod test {
            { "tata": "hello" },
        ]]);

-       assert!(builder.add_documents(docs).is_err());
+       let docs = serde_json::to_vec(&docs).unwrap();
+       assert!(builder.extend_from_json(Cursor::new(docs)).is_err());

        let docs = json!("hello");
+       let docs = serde_json::to_vec(&docs).unwrap();

-       assert!(builder.add_documents(docs).is_err());
+       assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
    }

    #[test]
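All of the rewritten tests above follow the same round trip; condensed into one sketch, again assuming the `DocumentBatchBuilder`/`DocumentBatchReader` pair used elsewhere in this commit:

    use std::io::Cursor;

    fn roundtrip() {
        let doc = serde_json::json!({ "hello": "world" });
        let bytes = serde_json::to_vec(&doc).unwrap();

        let mut cursor = Cursor::new(Vec::new());
        let mut builder = milli::documents::DocumentBatchBuilder::new(&mut cursor).unwrap();
        builder.extend_from_json(Cursor::new(bytes)).unwrap();
        builder.finish().unwrap();

        // Rewind and read the batch back out.
        cursor.set_position(0);
        let _reader = milli::documents::DocumentBatchReader::from_reader(cursor).unwrap();
    }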
@@ -1,474 +1,134 @@
[The 474 removed lines were the former serde::Serializer-based implementation: DocumentSerializer, SeqSerializer, MapSerializer and FieldSerializer, with every unsupported serialize_* method returning Err(Error::InvalidDocumentFormat), plus an impl of serde::ser::Error for Error. The replacement file follows.]
+use std::collections::BTreeMap;
+use std::fmt;
+use std::io::{Cursor, Write};
+
+use byteorder::WriteBytesExt;
+use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
+use serde::Deserialize;
+use serde_json::Value;
+
+use super::{ByteCounter, DocumentsBatchIndex, Error};
+use crate::FieldId;
+
+macro_rules! tri {
+    ($e:expr) => {
+        match $e {
+            Ok(r) => r,
+            Err(e) => return Ok(Err(e.into())),
+        }
+    };
+}
+
+struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);
+
+impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_str(self)
+    }
+}
+
+impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;
+
+    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+    where
+        E: serde::de::Error,
+    {
+        Ok(self.0.insert(v))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a string")
+    }
+}
+
+struct ValueDeserializer;
+
+impl<'de> DeserializeSeed<'de> for ValueDeserializer {
+    type Value = serde_json::Value;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        serde_json::Value::deserialize(deserializer)
+    }
+}
+
+pub struct DocumentVisitor<'a, W> {
+    pub inner: &'a mut ByteCounter<W>,
+    pub index: &'a mut DocumentsBatchIndex,
+    pub obkv_buffer: &'a mut Vec<u8>,
+    pub value_buffer: &'a mut Vec<u8>,
+    pub values: &'a mut BTreeMap<FieldId, Value>,
+    pub count: &'a mut usize,
+}
+
+impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
+    /// This Visitor value is nothing, since it write the value to a file.
+    type Value = Result<(), Error>;
+
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: SeqAccess<'de>,
+    {
+        while let Some(v) = seq.next_element_seed(&mut *self)? {
+            tri!(v)
+        }
+
+        Ok(Ok(()))
+    }
+
+    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
+    where
+        A: MapAccess<'de>,
+    {
+        while let Some((key, value)) =
+            map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
+        {
+            self.values.insert(key, value);
+        }
+
+        self.obkv_buffer.clear();
+        let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
+        for (key, value) in self.values.iter() {
+            self.value_buffer.clear();
+            // This is guaranteed to work
+            tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
+            tri!(obkv.insert(*key, &self.value_buffer));
+        }
+
+        let reader = tri!(obkv.into_inner()).into_inner();
+
+        tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
+        tri!(self.inner.write_all(reader));
+
+        *self.count += 1;
+        self.values.clear();
+
+        Ok(Ok(()))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a documents, or a sequence of documents.")
+    }
+}
+
+impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
+where
+    W: Write,
+{
+    type Value = Result<(), Error>;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        deserializer.deserialize_map(self)
+    }
+}
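The heart of this rewrite is serde's stateful deserialization: a `DeserializeSeed` carries mutable state (here the field-id index) into the deserializer, so each document is written out as it is parsed instead of first being buffered as a `serde_json::Value`. A self-contained sketch of the same pattern, deliberately independent of milli's types:

    use std::collections::HashMap;
    use std::fmt;

    use serde::de::{DeserializeSeed, Deserializer, Visitor};

    // Interns field names into ids while deserializing, like FieldIdResolver above.
    struct Interner<'a>(&'a mut HashMap<String, u16>);

    impl<'a, 'de> DeserializeSeed<'de> for Interner<'a> {
        type Value = u16;

        fn deserialize<D: Deserializer<'de>>(self, deserializer: D) -> Result<u16, D::Error> {
            deserializer.deserialize_str(self)
        }
    }

    impl<'a, 'de> Visitor<'de> for Interner<'a> {
        type Value = u16;

        fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
            f.write_str("a string")
        }

        fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<u16, E> {
            let next = self.0.len() as u16;
            Ok(*self.0.entry(v.to_string()).or_insert(next))
        }
    }

The `tri!` macro exists because `visit_map` can only fail with the deserializer's error type; milli's own errors are smuggled out as the `Ok(Err(_))` half of a nested `Result` instead.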
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::BTreeSet;
 use std::convert::Infallible;
 use std::error::Error as StdError;
 use std::{fmt, io, str};
@@ -57,19 +57,17 @@ pub enum UserError {
    CriterionError(CriterionError),
    DocumentLimitReached,
    InvalidDocumentId { document_id: Value },
-   InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
+   InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
    InvalidGeoField { document_id: Value, object: Value },
    InvalidFilter(String),
-   InvalidSortName { name: String },
-   InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
+   InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
    SortRankingRuleMissing,
    InvalidStoreFile,
    MaxDatabaseSizeReached,
-   MissingDocumentId { document: Object },
+   MissingDocumentId { primary_key: String, document: Object },
    MissingPrimaryKey,
    NoSpaceLeftOnDevice,
-   PrimaryKeyCannotBeChanged,
-   PrimaryKeyCannotBeReset,
+   PrimaryKeyCannotBeChanged(String),
    SerdeJson(serde_json::Error),
    SortError(SortError),
    UnknownInternalDocumentId { document_id: DocumentId },
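The `HashSet` to `BTreeSet` swaps here are presumably about determinism: a `BTreeSet` iterates in sorted order, so field lists joined into error messages come out stable across runs. A quick illustration:

    use std::collections::BTreeSet;

    fn main() {
        let fields: BTreeSet<String> =
            ["price", "name", "age"].iter().map(|s| s.to_string()).collect();

        // Always "age, name, price", whatever the insertion order.
        let list = fields.iter().map(AsRef::as_ref).collect::<Vec<&str>>().join(", ");
        assert_eq!(list, "age, name, price");
    }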
@@ -168,7 +166,7 @@ impl From<SerializationError> for Error {
impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
-           Self::InternalError(error) => write!(f, "internal: {}", error),
+           Self::InternalError(error) => write!(f, "internal: {}.", error),
            Self::IoError(error) => error.fmt(f),
            Self::UserError(error) => error.fmt(f),
        }
@@ -181,15 +179,15 @@ impl fmt::Display for InternalError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            Self::DatabaseMissingEntry { db_name, key } => {
-               write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name)
+               write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name)
            }
            Self::FieldIdMapMissingEntry(error) => error.fmt(f),
            Self::Fst(error) => error.fmt(f),
            Self::GrenadInvalidCompressionType => {
-               f.write_str("invalid compression type have been specified to grenad")
+               f.write_str("Invalid compression type have been specified to grenad.")
            }
            Self::IndexingMergingKeys { process } => {
-               write!(f, "invalid merge while processing {}", process)
+               write!(f, "Invalid merge while processing {}.", process)
            }
            Self::Serialization(error) => error.fmt(f),
            Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f),
@@ -207,69 +205,75 @@ impl StdError for InternalError {}
impl fmt::Display for UserError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
-           Self::InvalidFilter(input) => write!(f, "{}", input),
-           Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"),
+           Self::InvalidFilter(error) => f.write_str(error),
+           Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."),
            Self::CriterionError(error) => write!(f, "{}", error),
-           Self::DocumentLimitReached => f.write_str("maximum number of documents reached"),
+           Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."),
            Self::InvalidFacetsDistribution { invalid_facets_name } => {
                let name_list =
                    invalid_facets_name.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
                write!(
                    f,
-                   "invalid facet distribution, the fields {} are not set as filterable",
+                   "Invalid facet distribution, the fields `{}` are not set as filterable.",
                    name_list
                )
            }
-           Self::InvalidGeoField { document_id, object } => write!(
-               f,
-               "the document with the id: {} contains an invalid _geo field: {}",
-               document_id, object
-           ),
-           Self::InvalidDocumentId { document_id } => {
-               let json = serde_json::to_string(document_id).unwrap();
-               write!(
-                   f,
-                   "document identifier is invalid {}, \
-                   a document id can be of type integer or string \
-                   only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)",
-                   json
-               )
-           }
-           Self::InvalidSortName { name } => {
-               write!(f, "Invalid syntax for the sort parameter: {}", name)
-           }
+           Self::InvalidGeoField { document_id, object } => {
+               let document_id = match document_id {
+                   Value::String(id) => id.clone(),
+                   _ => document_id.to_string(),
+               };
+               let object = match object {
+                   Value::String(id) => id.clone(),
+                   _ => object.to_string(),
+               };
+               write!(
+                   f,
+                   "The document with the id: `{}` contains an invalid _geo field: `{}`.",
+                   document_id, object
+               )
+           },
+           Self::InvalidDocumentId { document_id } => {
+               let document_id = match document_id {
+                   Value::String(id) => id.clone(),
+                   _ => document_id.to_string(),
+               };
+               write!(
+                   f,
+                   "Document identifier `{}` is invalid. \
+                   A document identifier can be of type integer or string, \
+                   only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).",
+                   document_id
+               )
+           }
            Self::InvalidSortableAttribute { field, valid_fields } => {
                let valid_names =
                    valid_fields.iter().map(AsRef::as_ref).collect::<Vec<_>>().join(", ");
                write!(
                    f,
-                   "Attribute {} is not sortable, available sortable attributes are: {}",
+                   "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.",
                    field, valid_names
                )
            }
            Self::SortRankingRuleMissing => f.write_str(
-               "You must specify where \"sort\" is listed in the \
-               rankingRules setting to use the sort parameter at search time",
+               "The sort ranking rule must be specified in the \
+               ranking rules settings to use the sort parameter at search time.",
            ),
-           Self::MissingDocumentId { document } => {
+           Self::MissingDocumentId { primary_key, document } => {
                let json = serde_json::to_string(document).unwrap();
-               write!(f, "document doesn't have an identifier {}", json)
+               write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
            }
-           Self::MissingPrimaryKey => f.write_str("missing primary key"),
-           Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
-           // TODO where can we find it instead of writing the text ourselves?
-           Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"),
-           Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
-           Self::PrimaryKeyCannotBeChanged => {
-               f.write_str("primary key cannot be changed if the database contains documents")
-           }
-           Self::PrimaryKeyCannotBeReset => {
-               f.write_str("primary key cannot be reset if the database contains documents")
-           }
+           Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."),
+           Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
+           Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
+           Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
+           Self::PrimaryKeyCannotBeChanged(primary_key) => {
+               write!(f, "Index already has a primary key: `{}`.", primary_key)
+           }
            Self::SerdeJson(error) => error.fmt(f),
            Self::SortError(error) => write!(f, "{}", error),
            Self::UnknownInternalDocumentId { document_id } => {
-               write!(f, "an unknown internal document id have been used ({})", document_id)
+               write!(f, "An unknown internal document id have been used: `{}`.", document_id)
            }
        }
    }
}
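Nothing structural changes in that hunk; it is a copy pass. Messages become full sentences, interpolated values are backtick-quoted, and variants that gained fields, such as `MissingDocumentId`, now carry enough context to render a precise message. A sketch of what a caller would see, with the variant shape taken from the diff (the `milli::error` path is an assumption about the crate's module layout):

    fn main() {
        let document = serde_json::json!({ "title": "no id here" })
            .as_object()
            .unwrap()
            .clone();
        let err = milli::error::UserError::MissingDocumentId {
            primary_key: "id".to_string(),
            document,
        };

        // Displays: Document doesn't have a `id` attribute: `{"title":"no id here"}`.
        println!("{}", err);
    }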
@@ -953,6 +953,7 @@ pub(crate) mod tests {
            { "id": 1, "name": "kevin", "has_dog": true },
            { "id": 2, "name": "bob" }
        ]);

        let mut wtxn = index.write_txn().unwrap();
        let builder = IndexDocuments::new(&mut wtxn, &index, 0);
        builder.execute(content, |_, _| ()).unwrap();
@@ -68,7 +68,9 @@ mod test {
                "txts": sample_txts[..(rng.gen_range(0..3))],
                "cat-ints": sample_ints[..(rng.gen_range(0..3))],
            });
-           builder.add_documents(doc).unwrap();
+
+           let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
+           builder.extend_from_json(doc).unwrap();
        }

        builder.finish().unwrap();
milli/src/search/facet/grammar.pest (new file, 33 lines)
@@ -0,0 +1,33 @@
+key = _{reserved | quoted | word }
+value = _{quoted | word }
+quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
+string = {char*}
+word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
+
+char = _{ !(PEEK | "\\") ~ ANY
+    | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
+    | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
+
+reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" }
+// we deliberately choose to allow empty parameters to generate more specific error message later
+parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
+condition = _{between | eq | greater | less | geq | leq | neq}
+between = {key ~ value ~ "TO" ~ value}
+geq = {key ~ ">=" ~ value}
+leq = {key ~ "<=" ~ value}
+neq = {key ~ "!=" ~ value}
+eq = {key ~ "=" ~ value}
+greater = {key ~ ">" ~ value}
+less = {key ~ "<" ~ value}
+geo_radius = {"_geoRadius" ~ parameters }
+
+prgm = {SOI ~ expr ~ EOI}
+expr = _{ ( term ~ (operation ~ term)* ) }
+term = { ("(" ~ expr ~ ")") | condition | not | geo_radius }
+operation = _{ and | or }
+and = {"AND"}
+or = {"OR"}
+
+not = {"NOT" ~ term}
+
+WHITESPACE = _{ " " }
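This grammar feeds milli's filter parser through the pest crate. A hedged sketch of how a .pest file like this is typically wired up (the derive attribute and the generated `Rule` enum are pest conventions; the struct name here is illustrative, not necessarily milli's):

    use pest::Parser;
    use pest_derive::Parser;

    #[derive(Parser)]
    #[grammar = "search/facet/grammar.pest"] // path relative to src/
    struct FilterParser;

    fn main() {
        // prgm is the entry rule defined at the bottom of the grammar.
        let pairs = FilterParser::parse(Rule::prgm, "price > 10 AND NOT type = book")
            .expect("filter should match the grammar");

        for pair in pairs {
            println!("{:?}", pair.as_rule());
        }
    }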
@@ -151,13 +151,13 @@ impl<'a> Search<'a> {
            Member::Field(ref field) if !sortable_fields.contains(field) => {
                return Err(UserError::InvalidSortableAttribute {
                    field: field.to_string(),
-                   valid_fields: sortable_fields,
+                   valid_fields: sortable_fields.into_iter().collect(),
                })?
            }
            Member::Geo(_) if !sortable_fields.contains("_geo") => {
                return Err(UserError::InvalidSortableAttribute {
                    field: "_geo".to_string(),
-                   valid_fields: sortable_fields,
+                   valid_fields: sortable_fields.into_iter().collect(),
                })?
            }
            _ => (),
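The only functional change in that hunk is the `.into_iter().collect()` at the two error sites: `sortable_fields` is still a `HashSet<String>`, and collecting converts it into the `BTreeSet<String>` the reworked `InvalidSortableAttribute` variant now expects. In isolation:

    use std::collections::{BTreeSet, HashSet};

    fn main() {
        let sortable_fields: HashSet<String> =
            vec!["price".to_string(), "age".to_string()].into_iter().collect();

        // Rebuilds the set with deterministic (sorted) iteration order.
        let valid_fields: BTreeSet<String> = sortable_fields.into_iter().collect();
        let ordered: Vec<String> = valid_fields.into_iter().collect();
        assert_eq!(ordered, vec!["age".to_string(), "price".to_string()]);
    }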
@@ -877,7 +877,8 @@ mod tests {
        let mut cursor = Cursor::new(Vec::new());

        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-       builder.add_documents(big_object).unwrap();
+       let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
+       builder.extend_from_json(big_object).unwrap();
        builder.finish().unwrap();
        cursor.set_position(0);
        let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
|||||||
|
|
||||||
let mut cursor = Cursor::new(Vec::new());
|
let mut cursor = Cursor::new(Vec::new());
|
||||||
|
|
||||||
|
let big_object = serde_json::to_string(&big_object).unwrap();
|
||||||
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
|
||||||
builder.add_documents(big_object).unwrap();
|
builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
|
||||||
builder.finish().unwrap();
|
builder.finish().unwrap();
|
||||||
cursor.set_position(0);
|
cursor.set_position(0);
|
||||||
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
let content = DocumentBatchReader::from_reader(cursor).unwrap();
|
||||||
|
@@ -75,7 +75,7 @@ fn create_fields_mapping(
        .collect()
}

-fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
+fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
    index
        .iter()
        .sorted_by_key(|(k, _)| *k)
|
|||||||
if !self.autogenerate_docids {
|
if !self.autogenerate_docids {
|
||||||
let mut json = Map::new();
|
let mut json = Map::new();
|
||||||
for (key, value) in document.iter() {
|
for (key, value) in document.iter() {
|
||||||
let key = addition_index.get_by_left(&key).cloned();
|
let key = addition_index.name(key).cloned();
|
||||||
let value = serde_json::from_slice::<Value>(&value).ok();
|
let value = serde_json::from_slice::<Value>(&value).ok();
|
||||||
|
|
||||||
if let Some((k, v)) = key.zip(value) {
|
if let Some((k, v)) = key.zip(value) {
|
||||||
@@ -187,7 +187,11 @@ impl Transform<'_, '_> {
                }
            }

-           return Err(UserError::MissingDocumentId { document: json }.into());
+           return Err(UserError::MissingDocumentId {
+               primary_key: primary_key_name,
+               document: json,
+           }
+           .into());
        }

        let uuid =
@@ -544,6 +548,7 @@ mod test {
    mod primary_key_inference {
        use bimap::BiHashMap;

+       use crate::documents::DocumentsBatchIndex;
        use crate::update::index_documents::transform::find_primary_key;

        #[test]
@@ -557,7 +562,7 @@ mod test {
            map.insert(4, "fakeId".to_string());
            map.insert(0, "realId".to_string());

-           assert_eq!(find_primary_key(&map), Some("realId"));
+           assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
        }
    }
}
@@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    self.index.put_primary_key(self.wtxn, primary_key)?;
                    Ok(())
                } else {
-                   Err(UserError::PrimaryKeyCannotBeChanged.into())
+                   let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                   Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                }
            }
            Setting::Reset => {
@@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                    self.index.delete_primary_key(self.wtxn)?;
                    Ok(())
                } else {
-                   Err(UserError::PrimaryKeyCannotBeReset.into())
+                   let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                   Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                }
            }
            Setting::NotSet => Ok(()),
|
|||||||
builder.reset_primary_key();
|
builder.reset_primary_key();
|
||||||
|
|
||||||
let err = builder.execute(|_, _| ()).unwrap_err();
|
let err = builder.execute(|_, _| ()).unwrap_err();
|
||||||
assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset)));
|
assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
|
||||||
wtxn.abort().unwrap();
|
wtxn.abort().unwrap();
|
||||||
|
|
||||||
// But if we clear the database...
|
// But if we clear the database...
|
||||||
|
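With `PrimaryKeyCannotBeReset` gone, both the change path and the reset path report the same `PrimaryKeyCannotBeChanged(String)`, carrying the currently-set key so the message can name it. The control flow, with milli's index calls stubbed out as assumptions:

    // Sketch: index_is_empty and current_primary_key stand in for milli's
    // index queries; only the branching mirrors the diff above.
    fn set_primary_key(index_is_empty: bool, current_primary_key: &str) -> Result<(), String> {
        if index_is_empty {
            Ok(()) // put_primary_key / delete_primary_key would run here.
        } else {
            Err(format!("Index already has a primary key: `{}`.", current_primary_key))
        }
    }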
@@ -61,9 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
    let mut cursor = Cursor::new(Vec::new());
    let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
    let reader = Cursor::new(CONTENT.as_bytes());

    for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
-       documents_builder.add_documents(doc.unwrap()).unwrap();
+       let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
+       documents_builder.extend_from_json(doc).unwrap();
    }

    documents_builder.finish().unwrap();

    cursor.set_position(0);
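The helper above relies on `serde_json::Deserializer::into_iter`, which pulls successive top-level JSON values out of a single reader; each value is then re-serialized and streamed into `extend_from_json`. The iterator half on its own:

    fn main() {
        let content = r#"{"id": 1} {"id": 2} {"id": 3}"#;

        // into_iter yields each top-level JSON value in the stream.
        for doc in serde_json::Deserializer::from_str(content).into_iter::<serde_json::Value>() {
            let doc = doc.expect("valid JSON value");
            println!("{}", doc);
        }
    }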
@@ -409,7 +409,8 @@ fn criteria_ascdesc() {
            "age": age,
        });

-       batch_builder.add_documents(json).unwrap();
+       let json = Cursor::new(serde_json::to_vec(&json).unwrap());
+       batch_builder.extend_from_json(json).unwrap();
    });

    batch_builder.finish().unwrap();
Cargo.toml of the removed "search" crate (deleted file, 18 lines)
@@ -1,18 +0,0 @@
-[package]
-name = "search"
-version = "0.18.0"
-authors = ["Clément Renault <clement@meilisearch.com>"]
-edition = "2018"
-
-[dependencies]
-anyhow = "1.0.38"
-byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
-heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-log = "0.4.14"
-milli = { path = "../milli" }
-serde_json = "1.0.62"
-stderrlog = "0.5.1"
-structopt = { version = "0.3.21", default-features = false }
-
-[target.'cfg(target_os = "linux")'.dependencies]
-jemallocator = "0.3.2"