From 048e174efb80daa7ca0ac302f53d36fe514d172e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 10:13:48 +0200 Subject: [PATCH 01/35] Do not allocate when parsing CSV headers --- milli/src/documents/builder.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2be7c1dd8..391175f31 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -108,7 +108,7 @@ impl DocumentBatchBuilder { .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(&k), t)) + .map(|(k, t)| (this.index.insert(k), t)) .collect::>(); for (i, record) in records.into_records().enumerate() { @@ -161,16 +161,16 @@ enum AllowedType { Number, } -fn parse_csv_header(header: &str) -> (String, AllowedType) { +fn parse_csv_header(header: &str) -> (&str, AllowedType) { // if there are several separators we only split on the last one. match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), + "string" => (field_name, AllowedType::String), + "number" => (field_name, AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. - _otherwise => (header.to_string(), AllowedType::String), + _otherwise => (header, AllowedType::String), }, - None => (header.to_string(), AllowedType::String), + None => (header, AllowedType::String), } } From eb63af1f1024063afc14eca909e99b4b370b2ff1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 13:20:51 +0200 Subject: [PATCH 02/35] Update grenad to 0.4.2 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index feddf91c5..d980c6041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } From 419ce3966c42fcb66fd1dbdb239b4287bf55d74b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:03:48 +0200 Subject: [PATCH 03/35] Rework the DocumentsBatchBuilder/Reader to use grenad --- milli/src/documents/builder.rs | 216 +++++++++++++++--------------- milli/src/documents/mod.rs | 107 ++++++--------- milli/src/documents/reader.rs | 117 +++++++++------- milli/src/documents/serde_impl.rs | 134 ------------------ 4 files changed, 218 insertions(+), 356 deletions(-) delete mode 100644 milli/src/documents/serde_impl.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 391175f31..159afb8d9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,157 +1,159 @@ -use std::collections::BTreeMap; -use std::io; -use std::io::{Cursor, Write}; +use std::io::{self, Write}; -use byteorder::{BigEndian, WriteBytesExt}; -use serde::Deserializer; -use serde_json::Value; +use grenad::{CompressionType, WriterBuilder}; +use serde_json::{to_writer, Map, Value}; -use super::serde_impl::DocumentVisitor; 
-use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
-use crate::FieldId;
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
-/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate over the documents.
+/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
+/// to iterate over the documents.
 ///
 /// ## example:
 /// ```
-/// use milli::documents::DocumentBatchBuilder;
 /// use serde_json::json;
-/// use std::io::Cursor;
+/// use milli::documents::DocumentsBatchBuilder;
 ///
-/// let json = r##"{"id": 1, "name": "foo"}"##;
-/// let mut writer = Cursor::new(Vec::new());
-/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
-/// builder.finish().unwrap();
+/// let json = json!({ "id": 1, "name": "foo" });
+///
+/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
+/// builder.append_json_object(json.as_object().unwrap()).unwrap();
+/// let _vector = builder.into_inner().unwrap();
 /// ```
-pub struct DocumentBatchBuilder<W> {
-    inner: ByteCounter<W>,
-    index: DocumentsBatchIndex,
+pub struct DocumentsBatchBuilder<W> {
+    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
+    writer: grenad::Writer<W>,
+    /// A map that creates the relation between field ids and field names.
+    fields_index: DocumentsBatchIndex,
+    /// The number of documents that were added to this builder,
+    /// it doesn't take the primary key of the documents into account at this point.
+    documents_count: u32,
+
+    /// A buffer to store a temporary obkv buffer and avoid reallocating.
     obkv_buffer: Vec<u8>,
+    /// A buffer to serialize the values and avoid reallocating,
+    /// serialized values are stored in an obkv.
     value_buffer: Vec<u8>,
-    values: BTreeMap<FieldId, Value>,
-    count: usize,
 }
 
-impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
-    pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::default();
-        let mut writer = ByteCounter::new(writer);
-        // add space to write the offset of the metadata at the end of the writer
-        writer.write_u64::<BigEndian>(0)?;
-
-        Ok(Self {
-            inner: writer,
-            index,
+impl<W: Write> DocumentsBatchBuilder<W> {
+    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
+        DocumentsBatchBuilder {
+            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
+            fields_index: DocumentsBatchIndex::default(),
+            documents_count: 0,
             obkv_buffer: Vec::new(),
             value_buffer: Vec::new(),
-            values: BTreeMap::new(),
-            count: 0,
-        })
+        }
     }
 
-    /// Returns the number of documents that have been written to the builder.
-    pub fn len(&self) -> usize {
-        self.count
+    /// Returns the number of documents inserted into this builder.
+    pub fn documents_count(&self) -> u32 {
+        self.documents_count
     }
 
-    /// This method must be called after the document addition is terminated. It will put the
-    /// metadata at the end of the file, and write the metadata offset at the beginning on the
-    /// file.
-    pub fn finish(self) -> Result<usize, Error> {
-        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
+    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_object(&mut self, object: &Map<String, Value>) -> io::Result<()> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
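+        // obkv expects keys to be inserted in strictly increasing order. A field
+        // seen for the first time in a later document gets a fresh, higher id,
+        // so the ids collected from `object.keys()` can come back out of order,
+        // e.g. `[2, 0]` for `{"title": ..., "id": ...}` when `id` was seen first.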
+ let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); + fields_ids.sort_unstable(); - let meta = DocumentsMetadata { count, index }; + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + for field_id in fields_ids { + let key = self.fields_index.name(field_id).unwrap(); + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &object[key])?; + writer.insert(field_id, &self.value_buffer)?; + } - bincode::serialize_into(&mut writer, &meta)?; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; - writer.seek(io::SeekFrom::Start(0))?; - writer.write_u64::(offset as u64)?; - - writer.flush()?; - - Ok(count) + Ok(()) } - /// Extends the builder with json documents from a reader. - pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { - let mut de = serde_json::Deserializer::from_reader(reader); - - let mut visitor = DocumentVisitor { - inner: &mut self.inner, - index: &mut self.index, - obkv_buffer: &mut self.obkv_buffer, - value_buffer: &mut self.value_buffer, - values: &mut self.values, - count: &mut self.count, - }; - - de.deserialize_any(&mut visitor).map_err(Error::JsonError)? - } - - /// Creates a builder from a reader of CSV documents. - /// - /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform - /// optimisations, and extending from another CSV is not allowed. - pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; - // Ensure that this is the first and only addition made with this builder - debug_assert!(this.index.is_empty()); - - let mut records = csv::Reader::from_reader(reader); - - let headers = records + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. + let mut typed_fields_ids: Vec<_> = reader .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(k), t)) - .collect::>(); + .map(|(k, t)| (self.fields_index.insert(k), t)) + .enumerate() + .collect(); + typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); - for (i, record) in records.into_records().enumerate() { - let record = record?; - this.obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { + let mut record = csv::StringRecord::new(); + let mut line = 0; + while reader.read_record(&mut record)? { + // We increment here and not at the end of the while loop to take + // the header offset into account. + line += 1; + + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + + for (i, (field_id, type_)) in typed_fields_ids.iter() { + self.value_buffer.clear(); + + let value = &record[*i]; + match type_ { AllowedType::Number => { if value.trim().is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - value.trim().parse::().map(Value::from).map_err(|error| { - Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), + match value.trim().parse::() { + Ok(float) => { + to_writer(&mut self.value_buffer, &float)?; } - })? 
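+                                // A single number that fails to parse aborts the
+                                // whole CSV import, reporting the bad value and
+                                // the line of the record it came from.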
+ Err(error) => { + return Err(Error::ParseFloat { + error, + line, + value: value.to_string(), + }); + } + } } } AllowedType::String => { if value.is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - Value::String(value.to_string()) + to_writer(&mut self.value_buffer, value)?; } } - }; + } - this.value_buffer.clear(); - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; + // We insert into the obkv writer the value buffer that has been filled just above. + writer.insert(*field_id, &self.value_buffer)?; } - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.count += 1; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; } - Ok(this) + Ok(()) + } + + /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`. + pub fn into_inner(mut self) -> io::Result { + let DocumentsBatchBuilder { mut writer, fields_index, .. } = self; + + // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer. + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &fields_index)?; + writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?; + + writer.into_inner() } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 09f15901d..bd0afc6e4 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,24 +1,22 @@ mod builder; -/// The documents module defines an intermediary document format that milli uses for indexation, and -/// provides an API to easily build and read such documents. -/// -/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can -/// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde_impl; use std::fmt::{self, Debug}; use std::io; use bimap::BiHashMap; -pub use builder::DocumentBatchBuilder; -pub use reader::DocumentBatchReader; +pub use builder::DocumentsBatchBuilder; +pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::FieldId; +/// The key that is used to store the `DocumentsBatchIndex` datastructure, +/// it is the absolute last key of the list. +const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); + /// A bidirectional map that links field ids to their name in a document batch. 
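+/// Ids are assigned in first-seen order: the first field name inserted into the
+/// index gets id 0, the next new name gets id 1, and so on.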
-#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { @@ -46,8 +44,8 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn name(&self, id: FieldId) -> Option<&String> { - self.0.get_by_left(&id) + pub fn name(&self, id: FieldId) -> Option<&str> { + self.0.get_by_left(&id).map(AsRef::as_ref) } pub fn recreate_json( @@ -69,50 +67,20 @@ impl DocumentsBatchIndex { } } -#[derive(Debug, Serialize, Deserialize)] -struct DocumentsMetadata { - count: usize, - index: DocumentsBatchIndex, -} - -pub struct ByteCounter { - count: usize, - writer: W, -} - -impl ByteCounter { - fn new(writer: W) -> Self { - Self { count: 0, writer } - } -} - -impl io::Write for ByteCounter { - fn write(&mut self, buf: &[u8]) -> io::Result { - let count = self.writer.write(buf)?; - self.count += count; - Ok(count) - } - - fn flush(&mut self) -> io::Result<()> { - self.writer.flush() - } -} - #[derive(Debug)] pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, - Custom(String), - JsonError(serde_json::Error), - CsvError(csv::Error), - Serialize(bincode::Error), + Csv(csv::Error), + Json(serde_json::Error), + Serialize(serde_json::Error), + Grenad(grenad::Error), Io(io::Error), - DocumentTooLarge, } impl From for Error { fn from(e: csv::Error) -> Self { - Self::CsvError(e) + Self::Csv(e) } } @@ -122,15 +90,15 @@ impl From for Error { } } -impl From for Error { - fn from(other: bincode::Error) -> Self { - Self::Serialize(other) +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::Json(other) } } -impl From for Error { - fn from(other: serde_json::Error) -> Self { - Self::JsonError(other) +impl From for Error { + fn from(other: grenad::Error) -> Self { + Self::Grenad(other) } } @@ -140,13 +108,14 @@ impl fmt::Display for Error { Error::ParseFloat { error, line, value } => { write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) } - Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), - Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), - Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), + Error::InvalidDocumentFormat => { + f.write_str("Invalid document addition format, missing the documents batch index.") + } Error::Io(e) => write!(f, "{}", e), - Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), Error::Serialize(e) => write!(f, "{}", e), - Error::CsvError(e) => write!(f, "{}", e), + Error::Grenad(e) => write!(f, "{}", e), + Error::Csv(e) => write!(f, "{}", e), + Error::Json(e) => write!(f, "{}", e), } } } @@ -158,15 +127,25 @@ impl std::error::Error for Error {} macro_rules! 
documents {
     ($data:tt) => {{
         let documents = serde_json::json!($data);
-        let mut writer = std::io::Cursor::new(Vec::new());
-        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-        let documents = serde_json::to_vec(&documents).unwrap();
-        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
-        builder.finish().unwrap();
+        let documents = match documents {
+            object @ serde_json::Value::Object(_) => vec![object],
+            serde_json::Value::Array(objects) => objects,
+            invalid => {
+                panic!("an array of objects must be specified, {:#?} is not an array", invalid)
+            }
+        };
 
-        writer.set_position(0);
+        let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
+        for document in documents {
+            let object = match document {
+                serde_json::Value::Object(object) => object,
+                invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
+            };
+            builder.append_json_object(&object).unwrap();
+        }
 
-        crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+        let vector = builder.into_inner().unwrap();
+        crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
     }};
 }
diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs
index 14d7c8ceb..3dff999f5 100644
--- a/milli/src/documents/reader.rs
+++ b/milli/src/documents/reader.rs
@@ -1,11 +1,9 @@
+use std::convert::TryInto;
 use std::io;
-use std::io::{BufReader, Read};
-use std::mem::size_of;
 
-use byteorder::{BigEndian, ReadBytesExt};
 use obkv::KvReader;
 
-use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::FieldId;
 
 /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
 /// a `DocumentsBatchWriter`.
 ///
 /// The documents are returned in the form of `obkv::Reader` where each field is identified with a
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
-pub struct DocumentBatchReader<R> {
-    reader: BufReader<R>,
-    metadata: DocumentsMetadata,
-    buffer: Vec<u8>,
-    seen_documents: usize,
+pub struct DocumentsBatchReader<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
 }
 
-impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
+impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
     /// Construct a `DocumentsBatchReader` from a reader.
     ///
-    /// It first retrieves the index, then moves to the first document. Subsequent calls to
-    /// `next_document` advance the document reader until all the documents have been read.
-    pub fn from_reader(mut reader: R) -> Result<Self, Error> {
-        let mut buffer = Vec::new();
+    /// It first retrieves the index, then moves to the first document. Use the `into_cursor`
+    /// method to iterate over the documents, from the first to the last.
+    pub fn from_reader(reader: R) -> Result<Self, Error> {
+        let reader = grenad::Reader::new(reader)?;
+        let mut cursor = reader.into_cursor()?;
 
-        let meta_offset = reader.read_u64::<BigEndian>()?;
-        reader.seek(io::SeekFrom::Start(meta_offset))?;
-        reader.read_to_end(&mut buffer)?;
-        let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
+        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
+            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
+            None => return Err(Error::InvalidDocumentFormat),
+        };
 
-        reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
-        buffer.clear();
-
-        let reader = BufReader::new(reader);
-
-        Ok(Self { reader, metadata, buffer, seen_documents: 0 })
+        Ok(DocumentsBatchReader { cursor, fields_index })
     }
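+
+    // A typical round trip, assuming `vector` holds the bytes produced by a
+    // `DocumentsBatchBuilder` (illustrative sketch only):
+    //
+    //     let reader = DocumentsBatchReader::from_reader(Cursor::new(vector))?;
+    //     let mut cursor = reader.into_cursor();
+    //     while let Some(document) = cursor.next_document()? {
+    //         // `document` is an obkv::KvReader keyed by `FieldId`.
+    //     }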
 
-    /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
-    /// reference to the addition index.
-    pub fn next_document_with_index<'a>(
-        &'a mut self,
-    ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
-        if self.seen_documents < self.metadata.count {
-            let doc_len = self.reader.read_u32::<BigEndian>()?;
-            self.buffer.resize(doc_len as usize, 0);
-            self.reader.read_exact(&mut self.buffer)?;
-            self.seen_documents += 1;
-
-            let reader = KvReader::new(&self.buffer);
-            Ok(Some((&self.metadata.index, reader)))
-        } else {
-            Ok(None)
-        }
-    }
-
-    /// Return the fields index for the documents batch.
-    pub fn index(&self) -> &DocumentsBatchIndex {
-        &self.metadata.index
-    }
-
-    /// Returns the number of documents in the reader.
-    pub fn len(&self) -> usize {
-        self.metadata.count
+    pub fn documents_count(&self) -> u32 {
+        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
     }
 
     pub fn is_empty(&self) -> bool {
-        self.len() == 0
+        self.cursor.len().saturating_sub(1) == 0
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
+
+    /// This method returns a forward cursor over the documents.
+    pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
+        let DocumentsBatchReader { cursor, fields_index } = self;
+        let mut cursor = DocumentsBatchCursor { cursor, fields_index };
+        cursor.reset();
+        cursor
+    }
+}
+
+/// A forward cursor over the documents in a `DocumentsBatchReader`.
+pub struct DocumentsBatchCursor<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
+}
+
+impl<R> DocumentsBatchCursor<R> {
+    pub fn into_reader(self) -> DocumentsBatchReader<R> {
+        let DocumentsBatchCursor { cursor, fields_index, .. } = self;
+        DocumentsBatchReader { cursor, fields_index }
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
+
+    /// Resets the cursor to be able to read from the start again.
+    pub fn reset(&mut self) {
+        self.cursor.reset();
+    }
+}
+
+impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
+    /// Returns the next document, starting from the first one. Subsequent calls to
+    /// `next_document` advance the document reader until all the documents have been read.
+    pub fn next_document(&mut self) -> Result<Option<KvReader<FieldId>>, grenad::Error> {
+        match self.cursor.move_on_next()? {
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
+                Ok(Some(KvReader::new(value)))
+            }
+            _otherwise => Ok(None),
+        }
     }
 }
diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs
deleted file mode 100644
index d57bf1ffb..000000000
--- a/milli/src/documents/serde_impl.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-use std::collections::BTreeMap;
-use std::fmt;
-use std::io::{Cursor, Write};
-
-use byteorder::WriteBytesExt;
-use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
-use serde::Deserialize;
-use serde_json::Value;
-
-use super::{ByteCounter, DocumentsBatchIndex, Error};
-use crate::FieldId;
-
-macro_rules!
tri { - ($e:expr) => { - match $e { - Ok(r) => r, - Err(e) => return Ok(Err(e.into())), - } - }; -} - -struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); - -impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_str(self) - } -} - -impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - Ok(self.0.insert(v)) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a string") - } -} - -struct ValueDeserializer; - -impl<'de> DeserializeSeed<'de> for ValueDeserializer { - type Value = serde_json::Value; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - serde_json::Value::deserialize(deserializer) - } -} - -pub struct DocumentVisitor<'a, W> { - pub inner: &'a mut ByteCounter, - pub index: &'a mut DocumentsBatchIndex, - pub obkv_buffer: &'a mut Vec, - pub value_buffer: &'a mut Vec, - pub values: &'a mut BTreeMap, - pub count: &'a mut usize, -} - -impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { - /// This Visitor value is nothing, since it write the value to a file. - type Value = Result<(), Error>; - - fn visit_seq(self, mut seq: A) -> Result - where - A: SeqAccess<'de>, - { - while let Some(v) = seq.next_element_seed(&mut *self)? { - tri!(v) - } - - Ok(Ok(())) - } - - fn visit_map(self, mut map: A) -> Result - where - A: MapAccess<'de>, - { - while let Some((key, value)) = - map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? - { - self.values.insert(key, value); - } - - self.obkv_buffer.clear(); - let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); - for (key, value) in self.values.iter() { - self.value_buffer.clear(); - // This is guaranteed to work - tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); - tri!(obkv.insert(*key, &self.value_buffer)); - } - - let reader = tri!(obkv.into_inner()).into_inner(); - - tri!(self.inner.write_u32::(reader.len() as u32)); - tri!(self.inner.write_all(reader)); - - *self.count += 1; - self.values.clear(); - - Ok(Ok(())) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a documents, or a sequence of documents.") - } -} - -impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where - W: Write, -{ - type Value = Result<(), Error>; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_map(self) - } -} From e8297ad27e4977f4f43c43f181dc9d7c9ea041dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:04:27 +0200 Subject: [PATCH 04/35] Fix the tests for the new DocumentsBatchBuilder/Reader --- milli/src/documents/builder.rs | 332 +++++++++--------- milli/src/documents/mod.rs | 133 ++----- milli/src/search/distinct/mod.rs | 31 +- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/transform.rs | 12 +- milli/tests/search/facet_distribution.rs | 26 +- milli/tests/search/mod.rs | 18 +- milli/tests/search/query_criteria.rs | 20 +- milli/tests/search/typo_tolerance.rs | 31 +- 9 files changed, 292 insertions(+), 374 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 159afb8d9..19cc1ce53 100644 --- a/milli/src/documents/builder.rs +++ 
b/milli/src/documents/builder.rs @@ -183,7 +183,8 @@ mod test { use serde_json::{json, Map}; use super::*; - use crate::documents::DocumentBatchReader; + use crate::documents::DocumentsBatchReader; + use crate::FieldId; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -192,7 +193,7 @@ mod test { let field_name = index.name(fid).unwrap().clone(); let value: Value = serde_json::from_slice(value).unwrap(); - map.insert(field_name, value); + map.insert(field_name.to_string(), value); } Value::Object(map) @@ -200,15 +201,13 @@ mod test { #[test] fn add_single_documents_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let json = serde_json::json!({ "id": 1, "field": "hello!", }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(json.as_object().unwrap()).unwrap(); let json = serde_json::json!({ "blabla": false, @@ -216,100 +215,64 @@ mod test { "id": 1, }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + builder.append_json_object(json.as_object().unwrap()).unwrap(); - assert_eq!(builder.len(), 2); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 3); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 3); - assert!(reader.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_seq_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let json = serde_json::json!([{ - "id": 1, - "field": "hello!", - },{ - "blabla": false, - "field": "hello!", - "id": 1, - } - ]); - - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 2); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 3); - - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn add_documents_csv() { - let mut cursor = Cursor::new(Vec::new()); + let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let csv = "id:number,field:string\n1,hello!\n2,blabla"; + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let 
vector = builder.into_inner().unwrap(); - let builder = - DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn simple_csv_document() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -320,22 +283,25 @@ mod test { }) ); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn coma_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -349,17 +315,20 @@ mod test { #[test] fn quote_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = 
reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -373,17 +342,20 @@ mod test { #[test] fn integer_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -397,17 +369,20 @@ mod test { #[test] fn float_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -421,17 +396,20 @@ mod test { #[test] fn several_colon_in_header() { - let documents = r#"city:love:string,country:state,pop + let csv_content = r#"city:love:string,country:state,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( 
val, @@ -445,17 +423,20 @@ mod test { #[test] fn ending_by_colon_in_header() { - let documents = r#"city:,country,pop + let csv_content = r#"city:,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -469,17 +450,20 @@ mod test { #[test] fn starting_by_colon_in_header() { - let documents = r#":city,country,pop + let csv_content = r#":city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -494,32 +478,36 @@ mod test { #[ignore] #[test] fn starting_by_colon_in_header2() { - let documents = r#":string,country,pop + let csv_content = r#":string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - assert!(reader.next_document_with_index().is_err()); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + + assert!(cursor.next_document().is_err()); } #[test] fn double_colon_in_header() { - let documents = r#"city::string,country,pop + let csv_content = r#"city::string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = 
builder.into_inner().unwrap();
+
+        let mut cursor =
+            DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
+        let index = cursor.documents_batch_index().clone();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_value(&doc, &index);
 
         assert_eq!(
             val,
@@ -533,34 +521,32 @@ mod test {
 
     #[test]
     fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
+        let csv_content = r#"city,country:number,pop
 "Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 
     #[test]
     fn bad_column_count1() {
-        let documents = r#"city,country,pop
+        let csv_content = r#"city,country,pop
 "Boston","United States","4628910", "too much""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 
     #[test]
     fn bad_column_count2() {
-        let documents = r#"city,country,pop
+        let csv_content = r#"city,country,pop
 "Boston","United States""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index bd0afc6e4..7a34ae13b 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -159,7 +159,7 @@ mod test {
 
     #[test]
     fn create_documents_no_errors() {
-        let json = json!({
+        let value = json!({
             "number": 1,
             "string": "this is a field",
             "array": ["an", "array"],
@@ -169,26 +169,17 @@ mod test {
             "bool": true
         });
 
-        let json = serde_json::to_vec(&json).unwrap();
-
-        let mut v = Vec::new();
-        let mut cursor = io::Cursor::new(&mut v);
-
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-
-        builder.extend_from_json(Cursor::new(json)).unwrap();
-
-        builder.finish().unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(value.as_object().unwrap()).unwrap();
+        let vector = builder.into_inner().unwrap();
 
         let mut documents =
-            DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
+            DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
 
-        assert_eq!(documents.index().iter().count(), 5);
-
-        let reader = documents.next_document_with_index().unwrap().unwrap();
-
-        assert_eq!(reader.1.iter().count(), 5);
-        assert!(documents.next_document_with_index().unwrap().is_none());
+        assert_eq!(documents.documents_batch_index().iter().count(), 5);
+        let reader = documents.next_document().unwrap().unwrap();
+        assert_eq!(reader.iter().count(), 5);
+        assert!(documents.next_document().unwrap().is_none());
     }
 
     #[test]
     fn test_add_multiple_documents() {
         let doc1 = json!({
             "tata": "hello",
         });
         let doc2 = json!({
             "toto": false,
         });
 
-        let doc1 = serde_json::to_vec(&doc1).unwrap();
-        let doc2 = serde_json::to_vec(&doc2).unwrap();
-
-        let mut v = Vec::new();
-        let mut cursor = io::Cursor::new(&mut v);
- let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(doc1)).unwrap(); - builder.extend_from_json(Cursor::new(doc2)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_array() { - let docs = json!([ - { "toto": false }, - { "tata": "hello" }, - ]); - - let docs = serde_json::to_vec(&docs).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(docs)).unwrap(); - - builder.finish().unwrap(); - - let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_invalid_document_format() { - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let docs = json!([[ - { "toto": false }, - { "tata": "hello" }, - ]]); - - let docs = serde_json::to_vec(&docs).unwrap(); - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); - - let docs = json!("hello"); - let docs = serde_json::to_vec(&docs).unwrap(); - - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); + DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); + assert_eq!(documents.documents_batch_index().iter().count(), 2); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 1); + assert!(documents.next_document().unwrap().is_some()); + assert!(documents.next_document().unwrap().is_none()); } #[test] fn test_nested() { - let mut docs = documents!([{ + let docs_reader = documents!([{ "hello": { "toto": ["hello"] } }]); - let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); - + let mut cursor = docs_reader.into_cursor(); + let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); } #[test] - fn out_of_order_fields() { + fn out_of_order_json_fields() { let _documents = documents!([ {"id": 1,"b": 0}, {"id": 2,"a": 0,"b": 0}, ]); } + + #[test] + fn out_of_order_csv_fields() { + let csv1_content = "id:number,b\n1,0"; + let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); + + let csv2_content = "id:number,a,b\n2,0,0"; + let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv1).unwrap(); + builder.append_csv(csv2).unwrap(); + 
let vector = builder.into_inner().unwrap(); + + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 237fd718a..670fa01ac 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -35,7 +35,7 @@ mod test { use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; use crate::update::{ @@ -43,14 +43,11 @@ mod test { }; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy> = Lazy::new(generate_documents); - - fn generate_documents() -> Vec { + static JSON: Lazy> = Lazy::new(|| { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -63,7 +60,7 @@ mod test { let mut sample_ints = cat_ints.clone(); sample_ints.shuffle(&mut rng); - let doc = json!({ + let json = json!({ "id": i, "txt": txt, "cat-int": rng.gen_range(0..3), @@ -71,13 +68,16 @@ mod test { "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); - builder.extend_from_json(doc).unwrap(); + let object = match json { + Value::Object(object) => object, + _ => panic!(), + }; + + builder.append_json_object(&object).unwrap(); } - builder.finish().unwrap(); - cursor.into_inner() - } + builder.into_inner().unwrap() + }); /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. @@ -101,7 +101,8 @@ mod test { IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = - crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) + .unwrap(); addition.add_documents(reader).unwrap(); addition.execute().unwrap(); @@ -109,8 +110,8 @@ mod test { let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); - let map = (0..documents.len() as u32).collect(); + let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); + let map = (0..documents.documents_count() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..7f6e00b11 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,7 +25,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::DocumentBatchReader; +use crate::documents::DocumentsBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -121,7 +121,7 @@ where /// builder, and the builder must be discarded. /// /// Returns the number of documents added to the builder. 
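+    ///
+    /// The reader is typically built from bytes produced by a `DocumentsBatchBuilder`,
+    /// via `DocumentsBatchReader::from_reader`.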
- pub fn add_documents(&mut self, reader: DocumentBatchReader) -> Result + pub fn add_documents(&mut self, reader: DocumentsBatchReader) -> Result where R: Read + Seek, { @@ -590,9 +590,8 @@ mod tests { use maplit::hashset; use super::*; - use crate::documents::DocumentBatchBuilder; + use crate::documents::DocumentsBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1252,21 +1251,17 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); for i in 0..1000 { let key = i.to_string(); - big_object.insert(key, "I am a text!"); + big_object.insert(key, serde_json::Value::from("I am a text!")); } - let mut cursor = Cursor::new(Vec::new()); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); - builder.extend_from_json(big_object).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1288,23 +1283,19 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); let content: String = (0..=u16::MAX) .into_iter() .map(|p| p.to_string()) .reduce(|a, b| a + " " + b.as_ref()) .unwrap(); - big_object.insert("content".to_string(), &content); + big_object.insert("content".to_string(), serde_json::Value::from(content)); - let mut cursor = Cursor::new(Vec::new()); - - let big_object = serde_json::to_string(&big_object).unwrap(); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1843,18 +1834,20 @@ mod tests { // Create 200 documents with a long text let content = { - let documents: Vec<_> = (0..200i32) + let documents_iter = (0..200i32) .into_iter() .map(|i| serde_json::json!({ "id": i, "script": script })) - .collect(); + .filter_map(|json| match json { + serde_json::Value::Object(object) => Some(object), + _ => None, + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); - writer.set_position(0); - crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + let mut builder = 
crate::documents::DocumentsBatchBuilder::new(Vec::new()); + for object in documents_iter { + builder.append_json_object(&object).unwrap(); + } + let vector = builder.into_inner().unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() }; // Index those 200 long documents diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 12a858024..129357075 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - mut reader: DocumentBatchReader, + reader: DocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let fields_index = reader.index(); + let mut cursor = reader.into_cursor(); + let fields_index = cursor.documents_batch_index(); let external_documents_ids = self.index.external_documents_ids(wtxn)?; let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; @@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - while let Some((addition_index, document)) = reader.next_document_with_index()? { + let addition_index = cursor.documents_batch_index().clone(); + while let Some(document) = cursor.next_document()? 
{ let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -840,7 +842,7 @@ fn update_primary_key<'a>( None => { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.name(key).cloned(); + let key = addition_index.name(key).map(ToString::to_string); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index d3aece2ab..66713de1e 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -3,9 +3,10 @@ use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{FacetDistribution, Index}; +use serde_json::{Deserializer, Map, Value}; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( - r#"[ - { + r#"{ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] - }, + } { "id": 345, "title": "I am the pig!", "tags": ["red"] - } - ]"#, + }"#, ); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..4cf117dc7 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,10 +6,11 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; +use serde_json::{Deserializer, Map, Value}; use slice_group_by::GroupBy; mod distinct; @@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = 
DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 893d7c30a..89a6a6eec 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; @@ -393,8 +393,7 @@ fn criteria_ascdesc() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { let mut rng = rand::thread_rng(); @@ -412,16 +411,17 @@ fn criteria_ascdesc() { "age": age, }); - let json = Cursor::new(serde_json::to_vec(&json).unwrap()); - batch_builder.extend_from_json(json).unwrap(); + let object = match json { + serde_json::Value::Object(object) => object, + _ => panic!(), + }; + + batch_builder.append_json_object(&object).unwrap(); }); - batch_builder.finish().unwrap(); - - cursor.set_position(0); - - let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + let vector = batch_builder.into_inner().unwrap(); + let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(reader).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 9a7986c5e..63bf22579 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() { options.map_size(4096 * 100); let index = Index::new(options, tmp.path()).unwrap(); - let documents = json!([ - { - "id": 1usize, - "data": "zealand", - }, - { - "id": 2usize, - "data": "zearand", - }, - ]); + let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); + let doc1 = json!({ + "id": 1usize, + "data": "zealand", + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); + let doc2 = json!({ + "id": 2usize, + "data": "zearand", + }); - writer.set_position(0); + 
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
+    builder.append_json_object(doc2.as_object().unwrap()).unwrap();
+    let vector = builder.into_inner().unwrap();
 
-    let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
+    let documents =
+        milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
 
     let mut txn = index.write_txn().unwrap();
     let config = IndexerConfig::default();
From 6d0498df2445975d8cfff3cb8c4b1453bd5e00e6 Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 14 Jun 2022 16:22:59 +0200
Subject: [PATCH 05/35] Fix the fuzz tests

---
 milli/fuzz/fuzz_targets/indexing.rs | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs
index b618aabad..5c3b79ed7 100644
--- a/milli/fuzz/fuzz_targets/indexing.rs
+++ b/milli/fuzz/fuzz_targets/indexing.rs
@@ -7,10 +7,10 @@ use anyhow::{bail, Result};
 use arbitrary_json::ArbitraryValue;
 use heed::EnvOpenOptions;
 use libfuzzer_sys::fuzz_target;
-use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::Index;
-use serde_json::Value;
+use serde_json::{Map, Value};
 
 #[cfg(target_os = "linux")]
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
 /// reads json from input and write an obkv batch to writer.
 pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
     let writer = BufWriter::new(writer);
-    let mut builder = DocumentBatchBuilder::new(writer)?;
-    builder.extend_from_json(input)?;
+    let mut builder = DocumentsBatchBuilder::new(writer);
 
-    if builder.len() == 0 {
+    let values: Vec<Map<String, Value>> = serde_json::from_reader(input)?;
+    // Bail before appending anything: at this point the builder is necessarily
+    // empty, so the emptiness of the payload must be checked on the parsed values.
+    if values.is_empty() {
         bail!("Empty payload");
     }
 
-    let count = builder.finish()?;
+    for object in values {
+        builder.append_json_object(&object)?;
+    }
 
-    Ok(count)
+    let count = builder.documents_count();
+    let vector = builder.into_inner()?;
+
+    Ok(count as usize)
 }
 
 fn index_documents(
     index: &mut milli::Index,
-    documents: DocumentBatchReader<Cursor<Vec<u8>>>,
+    documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
 ) -> Result<()> {
     let config = IndexerConfig::default();
     let mut wtxn = index.write_txn()?;
@@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
             // We ignore all malformed documents
             if let Ok(_) = read_json(json.as_bytes(), &mut documents) {
                 documents.rewind().unwrap();
-                let documents = DocumentBatchReader::from_reader(documents).unwrap();
+                let documents = DocumentsBatchReader::from_reader(documents).unwrap();
                 // A lot of errors can come out of milli and we don't know which ones are normal or not
                 // so we are only going to look for the unexpected panics.
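                 // The rewind + from_reader round-trip above is the pattern this series
                 // introduces everywhere; a minimal sketch of the reading side, assuming
                 // an in-memory cursor of bytes, looks like:
                 //
                 //     let reader = DocumentsBatchReader::from_reader(Cursor::new(bytes))?;
                 //     let mut cursor = reader.into_cursor();
                 //     while let Some(obkv) = cursor.next_document()? { /* one document */ }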
let _ = index_documents(&mut index, documents); From a4ceef96246e8e121007c16a0d159cd4c99ede11 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:35:59 +0200 Subject: [PATCH 06/35] Fix the cli for the new DocumentsBatchBuilder/Reader structs --- cli/src/main.rs | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 14bc797af..dcd0f407a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -8,6 +8,7 @@ use std::time::Instant; use byte_unit::Byte; use eyre::Result; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; @@ -225,9 +226,9 @@ impl Performer for DocumentAddition { DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, }; - let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; + let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - println!("Adding {} documents to the index.", reader.len()); + println!("Adding {} documents to the index.", reader.documents_count()); let mut txn = index.write_txn()?; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; @@ -321,35 +322,35 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa } fn documents_from_jsonl(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - let mut buf = String::new(); - let mut reader = BufReader::new(reader); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[derive(Debug, StructOpt)] From f29114f94aff42592d8f0a932784f578433e0744 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 17:21:54 +0200 Subject: [PATCH 07/35] Fix http-ui to fit with the new DocumentsBatchBuilder/Reader structs --- http-ui/src/main.rs | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ce4fa7ba5..63b9ee5e0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, Cursor, Read}; +use std::io::{BufReader, Cursor, Read}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -18,7 +18,7 @@ use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ @@ -399,7 +399,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid update format {:?}", otherwise), }; - let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; builder.add_documents(documents)?; @@ -1032,35 +1032,36 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - for result in BufReader::new(reader).lines() { - let line = result?; - documents.extend_from_json(Cursor::new(line))?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_json(reader: impl 
io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_json(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[cfg(test)] From a97d4d63b9f1f29edac7f78aa33710682996e63c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:17:48 +0200 Subject: [PATCH 08/35] Fix the benchmarks --- benchmarks/benches/utils.rs | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index b2a9966a2..091b9b0f5 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,13 +1,13 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; @@ -150,39 +150,38 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader documents_from_jsonl(reader).unwrap(), otherwise => panic!("invalid update format {:?}", otherwise), }; - DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() } -fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let mut buf = String::new(); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; - buf.clear(); + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } enum AllowedType { From bdc426388379e80e03d8291d3a12b2a6361f0571 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 09/35] Introduce the validate_documents_batch function --- milli/src/update/index_documents/transform.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 129357075..42187fc1e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{IndexDocumentsMethod, IndexerConfig}; +use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; From cefffde9af704bc9b4146206a852972745941eda Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:07:28 +0200 Subject: [PATCH 10/35] Improve the .gitignore of the fuzz crate --- milli/fuzz/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore index cb73742e4..ebf2c9395 100644 --- a/milli/fuzz/.gitignore +++ b/milli/fuzz/.gitignore @@ -1,2 +1,5 @@ +Cargo.lock +target/ + /corpus/ /artifacts/ From 0146175fe60e104519ebd23fa2e14bd1ff3e8bfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 11/35] Introduce the validate_documents_batch function --- milli/src/documents/builder.rs | 36 ++--- milli/src/documents/mod.rs | 28 +++- milli/src/error.rs | 6 + .../extract/extract_geo_points.rs | 9 +- milli/src/update/index_documents/mod.rs | 36 +++-- milli/src/update/index_documents/transform.rs | 26 +--- milli/src/update/index_documents/validate.rs | 140 ++++++++++++++++++ 7 files changed, 208 insertions(+), 73 deletions(-) create mode 100644 milli/src/update/index_documents/validate.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 19cc1ce53..15a22090a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -180,24 +180,10 @@ fn 
parse_csv_header(header: &str) -> (&str, AllowedType) { mod test { use std::io::Cursor; - use serde_json::{json, Map}; + use serde_json::json; use super::*; - use crate::documents::DocumentsBatchReader; - use crate::FieldId; - - fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { - let mut map = Map::new(); - - for (fid, value) in obkv.iter() { - let field_name = index.name(fid).unwrap().clone(); - let value: Value = serde_json::from_slice(value).unwrap(); - - map.insert(field_name.to_string(), value); - } - - Value::Object(map) - } + use crate::documents::{obkv_to_object, DocumentsBatchReader}; #[test] fn add_single_documents_json() { @@ -272,7 +258,7 @@ mod test { DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -301,7 +287,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -328,7 +314,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -355,7 +341,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -382,7 +368,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -409,7 +395,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -436,7 +422,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -463,7 +449,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -507,7 +493,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7a34ae13b..ee3593bf8 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -6,15 +6,30 @@ use std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; -use crate::FieldId; +use crate::error::{FieldIdMapMissingEntry, 
InternalError}; +use crate::{FieldId, Object, Result}; /// The key that is used to store the `DocumentsBatchIndex` datastructure, /// it is the absolute last key of the list. const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); +/// Helper function to convert an obkv reader into a JSON object. +pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { + obkv.iter() + .map(|(field_id, value)| { + let field_name = index.name(field_id).ok_or_else(|| { + FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } + })?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + Ok((field_name.to_string(), value)) + }) + .collect() +} + /// A bidirectional map that links field ids to their name in a document batch. #[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); @@ -48,11 +63,12 @@ impl DocumentsBatchIndex { self.0.get_by_left(&id).map(AsRef::as_ref) } - pub fn recreate_json( - &self, - document: &obkv::KvReaderU16, - ) -> Result, crate::Error> { - let mut map = serde_json::Map::new(); + pub fn id(&self, name: &str) -> Option { + self.0.get_by_right(name).cloned() + } + + pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result { + let mut map = Object::new(); for (k, v) in document.iter() { // TODO: TAMO: update the error type diff --git a/milli/src/error.rs b/milli/src/error.rs index 57ae1c85a..d34130210 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -141,10 +141,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco #[derive(Error, Debug)] pub enum GeoError { + #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] + NotAnObject { document_id: Value, value: Value }, + #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] + MissingLatitudeAndLongitude { document_id: Value }, #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a number but instead got `{value}`.")]
diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index fffae5e77..0f804b93b 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -7,6 +7,7 @@ use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
+use crate::update::index_documents::extract_float_from_value;
 use crate::{FieldId, InternalError, Result};
 
 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -61,11 +62,3 @@ pub fn extract_geo_points(
 
     Ok(writer_into_reader(writer)?)
 }
-
-fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
-    match value {
-        Value::Number(ref n) => n.as_f64().ok_or(value),
-        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
-        value => Err(value),
-    }
-}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 7f6e00b11..2fb7cbcd9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2,11 +2,13 @@ mod extract;
 mod helpers;
 mod transform;
 mod typed_chunk;
+mod validate;
 
 use std::collections::HashSet;
 use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};
+use std::result::Result as StdResult;
 
 use crossbeam_channel::{Receiver, Sender};
 use heed::types::Str;
@@ -25,13 +27,19 @@ pub use self::helpers::{
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
-use crate::documents::DocumentsBatchReader;
+use self::validate::validate_documents_batch;
+pub use self::validate::{
+    extract_float_from_value, validate_document_id, validate_document_id_from_json,
+    validate_geo_from_json,
+};
+use crate::documents::{obkv_to_object, DocumentsBatchReader};
+use crate::error::UserError;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result, RoaringBitmapCodec, UserError};
+use crate::{Index, Result, RoaringBitmapCodec};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -117,19 +125,27 @@ where
     /// Adds a batch of documents to the current builder.
     ///
-    /// Since the documents are progressively added to the writer, a failure will cause a stale
-    /// builder, and the builder must be discarded.
+    /// Since the documents are progressively added to the writer, a failure will only
+    /// return an error and not the `IndexDocuments` struct, as it is invalid to use it afterward.
     ///
     /// Returns the number of documents added to the builder.
-    pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
-    where
-        R: Read + Seek,
-    {
+    pub fn add_documents<R: Read + Seek>(
+        mut self,
+        reader: DocumentsBatchReader<R>,
+    ) -> Result<(Self, StdResult<u64, UserError>)> {
         // Early return when there is no document to add
         if reader.is_empty() {
-            return Ok(0);
+            return Ok((self, Ok(0)));
         }
 
+        // We check for user errors in this validator and if there is one, we can return
+        // the `IndexDocuments` struct as it is still valid to send more documents into it.
+        // However, if there is an internal error we throw it away!
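+        //
+        // From the caller's side the new contract reads like this, a sketch using
+        // exactly the names the tests and benchmarks below rely on:
+        //
+        //     let (builder, user_error) = builder.add_documents(documents).unwrap();
+        //     user_error.unwrap(); // a bad document surfaces here, the builder stays usable
+        //     builder.execute().unwrap();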
+        let reader = match validate_documents_batch(self.wtxn, self.index, reader)? {
+            Ok(reader) => reader,
+            Err(user_error) => return Ok((self, Err(user_error))),
+        };
+
         let indexed_documents = self
             .transform
             .as_mut()
@@ -139,7 +155,7 @@ where
 
         self.added_documents += indexed_documents;
 
-        Ok(indexed_documents)
+        Ok((self, Ok(indexed_documents)))
     }
 
     #[logging_timer::time("IndexDocuments::{}")]
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 42187fc1e..bc7eefd33 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -17,6 +17,7 @@ use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
+use crate::update::index_documents::validate_document_id_from_json;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@@ -782,14 +783,6 @@ fn compute_primary_key_pair(
     }
 }
 
-fn validate_document_id(document_id: &str) -> Option<&str> {
-    let document_id = document_id.trim();
-    Some(document_id).filter(|id| {
-        !id.is_empty()
-            && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
-    })
-}
-
 /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
 ///
 /// The size and alignment of T and U must match.
@@ -813,22 +806,7 @@ fn update_primary_key<'a>(
 ) -> Result<Cow<'a, str>> {
     match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
         Some((_, bytes)) => {
-            let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
-                Value::String(string) => match validate_document_id(&string) {
-                    Some(s) if s.len() == string.len() => string,
-                    Some(s) => s.to_string(),
-                    None => {
-                        return Err(UserError::InvalidDocumentId {
-                            document_id: Value::String(string),
-                        }
-                        .into())
-                    }
-                },
-                Value::Number(number) => number.to_string(),
-                content => {
-                    return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into())
-                }
-            };
+            let value = validate_document_id_from_json(bytes)??;
             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
             Ok(Cow::Owned(value))
         }
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
new file mode 100644
index 000000000..b4c0cb68f
--- /dev/null
+++ b/milli/src/update/index_documents/validate.rs
@@ -0,0 +1,140 @@
+use std::io::{Read, Seek};
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::error::{GeoError, InternalError, UserError};
+use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader};
+use crate::{Index, Result};
+
+/// This function validates a batch of documents by checking that:
+/// - we can infer a primary key,
+/// - all the document ids exist and are valid, and
+/// - the `_geo` field is valid depending on the settings.
+pub fn validate_documents_batch<R: Read + Seek>(
+    rtxn: &heed::RoTxn,
+    index: &Index,
+    reader: DocumentsBatchReader<R>,
+) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> {
+    let mut cursor = reader.into_cursor();
+    let documents_batch_index = cursor.documents_batch_index().clone();
+
+    // The primary key *field id* that has already been set for this index or the one
+    // we will guess by searching for the first key that contains "id" as a substring.
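+    //
+    // A worked example of the guessing rule implemented below: for a batch whose
+    // fields index maps `0 -> "title"`, `1 -> "id"`, `2 -> "user_id"`, the filter
+    // keeps `id` and `user_id`, and `min_by_key` on the field id picks `id`.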
+    let (primary_key, primary_key_id) = match index.primary_key(rtxn)? {
+        Some(primary_key) => match documents_batch_index.id(primary_key) {
+            Some(id) => (primary_key, id),
+            None => {
+                return match cursor.next_document()? {
+                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
+                        primary_key: primary_key.to_string(),
+                        document: obkv_to_object(&first_document, &documents_batch_index)?,
+                    })),
+                    // If there is no document in this batch the best we can do is to return this error.
+                    None => Ok(Err(UserError::MissingPrimaryKey)),
+                };
+            }
+        },
+        None => {
+            let guessed = documents_batch_index
+                .iter()
+                .filter(|(_, name)| name.contains("id"))
+                .min_by_key(|(fid, _)| *fid);
+            match guessed {
+                Some((id, name)) => (name.as_str(), *id),
+                None => return Ok(Err(UserError::MissingPrimaryKey)),
+            }
+        }
+    };
+
+    // If the settings specify that a `_geo` field must be used, we must check its
+    // validity in all the documents of this batch; this is when we return `Some`.
+    let geo_field_id = match documents_batch_index.id("_geo") {
+        Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
+        _otherwise => None,
+    };
+
+    while let Some(document) = cursor.next_document()? {
+        let document_id = match document.get(primary_key_id) {
+            Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? {
+                Ok(document_id) => document_id,
+                Err(user_error) => return Ok(Err(user_error)),
+            },
+            None => {
+                return Ok(Err(UserError::MissingDocumentId {
+                    primary_key: primary_key.to_string(),
+                    document: obkv_to_object(&document, &documents_batch_index)?,
+                }))
+            }
+        };
+
+        if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
+            if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
+                return Ok(Err(UserError::from(user_error)));
+            }
+        }
+    }
+
+    Ok(Ok(cursor.into_reader()))
+}
+
+/// Returns a trimmed version of the document id or `None` if it is invalid.
+pub fn validate_document_id(document_id: &str) -> Option<&str> {
+    let id = document_id.trim();
+    if !id.is_empty()
+        && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    {
+        Some(id)
+    } else {
+        None
+    }
+}
+
+/// Parses a JSON encoded document id and validates it, returning a user error when it is invalid.
+pub fn validate_document_id_from_json(bytes: &[u8]) -> Result<StdResult<String, UserError>> {
+    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+        Value::String(string) => match validate_document_id(&string) {
+            Some(s) if s.len() == string.len() => Ok(Ok(string)),
+            Some(s) => Ok(Ok(s.to_string())),
+            None => {
+                return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) }))
+            }
+        },
+        Value::Number(number) => Ok(Ok(number.to_string())),
+        content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
+    }
+}
+
+/// Try to extract an `f64` from a JSON `Value` and return the `Value`
+/// in the `Err` variant if it failed.
+pub fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
+    match value {
+        Value::Number(ref n) => n.as_f64().ok_or(value),
+        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
+        value => Err(value),
+    }
+}
+
+pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
+    let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?
{ + Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { + (Some(lat), Some(lng)) => { + match (extract_float_from_value(lat), extract_float_from_value(lng)) { + (Ok(_), Ok(_)) => Ok(()), + (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), + (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Err(lat), Err(lng)) => { + Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + } + } + } + (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), + (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), + (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + }, + value => Err(GeoError::NotAnObject { document_id, value }), + }; + + Ok(result) +} From fcfc4caf8c4512f9dce768017d95ac91804a90d2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:36:27 +0200 Subject: [PATCH 12/35] Move the Object type in the lib.rs file and use it everywhere --- benchmarks/benches/utils.rs | 20 ++++++++---------- cli/src/main.rs | 9 ++++---- http-ui/src/main.rs | 16 ++++++-------- milli/fuzz/fuzz_targets/indexing.rs | 2 +- milli/src/documents/builder.rs | 5 +++-- milli/src/error.rs | 6 ++---- milli/src/lib.rs | 21 ++++++++++--------- .../extract/extract_geo_points.rs | 1 - milli/src/update/index_documents/transform.rs | 2 +- milli/tests/search/facet_distribution.rs | 6 +++--- milli/tests/search/mod.rs | 6 +++--- 11 files changed, 43 insertions(+), 51 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 091b9b0f5..630e17943 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -11,8 +11,8 @@ use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; -use milli::{Filter, Index}; -use serde_json::{Map, Value}; +use milli::{Filter, Index, Object}; +use serde_json::Value; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents_from(conf.dataset, conf.dataset_format); - - builder.add_documents(documents).unwrap(); - + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -156,7 +154,7 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -166,7 +164,7 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -221,14 +219,14 @@ impl CSVDocumentDeserializer { } impl Iterator for 
CSVDocumentDeserializer { - type Item = anyhow::Result>; + type Item = anyhow::Result; fn next(&mut self) -> Option { let csv_document = self.documents.next()?; match csv_document { Ok(csv_document) => { - let mut document = Map::new(); + let mut document = Object::new(); for ((field_name, field_type), value) in self.headers.iter().zip(csv_document.into_iter()) diff --git a/cli/src/main.rs b/cli/src/main.rs index dcd0f407a..db4ca91ab 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,8 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::Index; -use serde_json::{Map, Value}; +use milli::{Index, Object}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -325,7 +324,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -335,7 +334,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -424,7 +423,7 @@ impl Search { filter: &Option, offset: &Option, limit: &Option, - ) -> Result>> { + ) -> Result> { let txn = index.read_txn()?; let mut search = index.search(&txn); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 63b9ee5e0..8167076c6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -26,11 +26,11 @@ use milli::update::{ }; use milli::{ obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, - MatcherBuilder, SearchResult, SortError, + MatcherBuilder, Object, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; @@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { } } - fn highlight_record( - &self, - object: &mut Map, - attributes_to_highlight: &HashSet, - ) { + fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet) { // TODO do we need to create a string for element that are not and needs to be highlight? 
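         // (In outline, the loop below rewrites each selected value in place; a sketch,
         // assuming this file's recursive `highlight_value` helper, which this hunk does
         // not show:
         //
         //     *value = self.highlight_value(std::mem::take(value));
         // )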
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { @@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> { #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] struct Answer { - documents: Vec>, + documents: Vec, number_of_candidates: u64, facets: BTreeMap>, } @@ -1036,7 +1032,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -1046,7 +1042,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 5c3b79ed7..e4f42655e 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -21,7 +21,7 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); let mut builder = DocumentsBatchBuilder::new(writer); - let values: Vec> = serde_json::from_reader(input)?; + let values: Vec = serde_json::from_reader(input)?; if builder.documents_count() == 0 { bail!("Empty payload"); } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 15a22090a..589e52269 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,10 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; -use serde_json::{to_writer, Map, Value}; +use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -55,7 +56,7 @@ impl DocumentsBatchBuilder { } /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. - pub fn append_json_object(&mut self, object: &Map) -> io::Result<()> { + pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
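         // (A small worked instance of that requirement: for `{"b": 1, "a": 2}` the two
         // keys get the ids `b -> 0` and `a -> 1` in insertion order, and the sort below
         // makes us write `(0, b-value)` before `(1, a-value)`, which is the ascending
         // key order `obkv`'s writer expects.)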
let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); fields_ids.sort_unstable(); diff --git a/milli/src/error.rs b/milli/src/error.rs index d34130210..a23472951 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -4,12 +4,10 @@ use std::{io, str}; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; -use serde_json::{Map, Value}; +use serde_json::Value; use thiserror::Error; -use crate::{CriterionError, DocumentId, FieldId, SortError}; - -pub type Object = Map; +use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..a7be87183 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; -use serde_json::{Map, Value}; +use serde_json::Value; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; @@ -43,20 +43,21 @@ pub use self::search::{ pub type Result = std::result::Result; +pub type Attribute = u32; +pub type BEU32 = heed::zerocopy::U32; +pub type BEU64 = heed::zerocopy::U64; +pub type DocumentId = u32; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; +pub type FieldDistribution = BTreeMap; +pub type FieldId = u16; +pub type Object = serde_json::Map; +pub type Position = u32; +pub type RelativePosition = u16; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; -pub type BEU32 = heed::zerocopy::U32; -pub type BEU64 = heed::zerocopy::U64; -pub type Attribute = u32; -pub type DocumentId = u32; -pub type FieldId = u16; -pub type Position = u32; -pub type RelativePosition = u16; -pub type FieldDistribution = BTreeMap; /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata /// is a tuple composed of 1. the DocumentId of the associated document and 2. 
the original point @@ -82,7 +83,7 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReaderU16, -) -> Result> { +) -> Result { displayed_fields .iter() .copied() diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0f804b93b..46ef9ba9b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -1,6 +1,5 @@ use std::fs::File; use std::io; -use std::result::Result as StdResult; use concat_arrays::concat_arrays; use serde_json::Value; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index bc7eefd33..4ece58509 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; +use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 66713de1e..8890285e7 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -5,8 +5,8 @@ use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{FacetDistribution, Index}; -use serde_json::{Deserializer, Map, Value}; +use milli::{FacetDistribution, Index, Object}; +use serde_json::Deserializer; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -46,7 +46,7 @@ fn test_facet_distribution_with_no_facet_values() { }"#, ); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 4cf117dc7..0b6ce80cc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -8,9 +8,9 @@ use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; use serde::Deserialize; -use serde_json::{Deserializer, Map, Value}; +use serde_json::Deserializer; use slice_group_by::GroupBy; mod distinct; @@ -66,7 +66,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } From 399eec5c0101780f74b9794957865c3fe8b3d519 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 14:35:19 +0200 Subject: [PATCH 
13/35] Fix the indexation tests --- benchmarks/benches/indexing.rs | 110 ++++++---- cli/src/main.rs | 7 +- http-ui/src/main.rs | 6 +- milli/src/index.rs | 25 ++- milli/src/search/distinct/mod.rs | 5 +- milli/src/search/facet/filter.rs | 5 +- milli/src/update/clear_documents.rs | 5 +- milli/src/update/delete_documents.rs | 18 +- milli/src/update/index_documents/mod.rs | 205 +++++++++++-------- milli/src/update/index_documents/validate.rs | 1 + milli/src/update/settings.rs | 70 ++++--- milli/tests/search/facet_distribution.rs | 6 +- milli/tests/search/mod.rs | 6 +- milli/tests/search/query_criteria.rs | 6 +- milli/tests/search/typo_tolerance.rs | 7 +- 15 files changed, 288 insertions(+), 194 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 3ae0a1a84..1b501b21a 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -223,11 +224,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -279,11 +281,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -294,19 +297,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); - 
builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -339,13 +344,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -377,12 +383,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -415,12 +422,13 @@ fn indexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -507,11 +515,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -564,12 +573,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -581,24 +591,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) 
.unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -631,12 +643,13 @@ fn indexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -720,11 +733,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -775,12 +789,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { // as we don't care about the time it takes. 
let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -791,21 +806,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -861,12 +878,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -922,11 +940,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -984,12 +1003,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1021,12 +1041,13 @@ fn indexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut 
builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1113,11 +1134,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index db4ca91ab..0d197af17 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -255,7 +255,7 @@ impl Performer for DocumentAddition { let bar = progesses.add(bar); bars.push(bar); } - let mut addition = milli::update::IndexDocuments::new( + let addition = milli::update::IndexDocuments::new( &mut txn, &index, &config, @@ -263,7 +263,10 @@ impl Performer for DocumentAddition { |step| indexing_callback(step, &bars), ) .unwrap(); - addition.add_documents(reader)?; + let (addition, user_error) = addition.add_documents(reader)?; + if let Err(error) = user_error { + return Err(error.into()); + } std::thread::spawn(move || { progesses.join().unwrap(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 8167076c6..117aa31e8 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -374,7 +374,7 @@ async fn main() -> anyhow::Result<()> { }); }; - let mut builder = milli::update::IndexDocuments::new( + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index_cloned, GLOBAL_CONFIG.get().unwrap(), @@ -397,8 +397,8 @@ async fn main() -> anyhow::Result<()> { let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - builder.add_documents(documents)?; - + let (builder, user_error) = builder.add_documents(documents)?; + let _count = user_error?; let result = builder.execute(); match result { diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..272877912 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1212,10 +1212,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1234,7 +1235,7 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); let content = documents!([ @@ -1242,7 +1243,8 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 }, ]); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1265,10 +1267,11 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1333,10 +1336,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1390,10 +1394,11 @@ pub(crate) mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 670fa01ac..1a9c56cf3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -97,14 +97,15 @@ mod test { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut addition = + let addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) .unwrap(); - addition.add_documents(reader).unwrap(); + let (addition, user_error) = addition.add_documents(reader).unwrap(); + user_error.unwrap(); addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..41e2f0657 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -648,10 +648,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..3fe57eeae 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -100,9 +100,10 @@ mod tests { ]); let indexing_config = 
IndexDocumentsConfig::default(); let config = IndexerConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Clear all documents from the database. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..49e7de8ae 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -657,13 +657,13 @@ mod tests { fn insert_documents<'t, R: std::io::Read + std::io::Seek>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, - documents: crate::documents::DocumentBatchReader, + documents: crate::documents::DocumentsBatchReader, ) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); } @@ -701,9 +701,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -736,9 +737,10 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Delete not all of the documents but some of them. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2fb7cbcd9..ae42483f0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -141,7 +141,12 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch(self.wtxn, self.index, reader)? { + let reader = match validate_documents_batch( + self.wtxn, + self.index, + self.config.autogenerate_docids, + reader, + )? 
{ Ok(reader) => reader, Err(user_error) => return Ok((self, Err(user_error))), }; @@ -626,10 +631,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -642,10 +648,11 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -662,9 +669,11 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -694,10 +703,11 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -722,9 +732,10 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -765,7 +776,7 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -794,10 +805,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -814,9 +826,10 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. 
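// (Replacement is keyed on the external document id here: since this new
// document's id matches `kevin_uuid`, it overwrites the previously indexed
// version instead of creating a second document.)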
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -856,9 +869,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -873,9 +887,10 @@ mod tests { let content = documents!([ { "name": "new kevin" } ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -898,9 +913,10 @@ mod tests { let content = documents!([]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -924,7 +940,7 @@ mod tests { let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); assert!(builder.add_documents(content).is_err()); @@ -934,9 +950,10 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. 
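// (That space was in the previous, rejected id, `"brume bleue"`. The `32` sent
// next is a plain integer, which the document id validation accepts; string
// ids must be non-empty and use only `a-zA-Z0-9`, `-` and `_`, as checked by
// `validate_document_id` further down in this series.)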
let content = documents!([ { "id": 32, "name": "kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -963,9 +980,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1009,9 +1027,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1020,7 +1039,7 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents!([ { @@ -1030,7 +1049,8 @@ mod tests { } ]); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1057,9 +1077,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1099,10 +1120,11 @@ mod tests { { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1138,10 +1160,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1151,10 +1174,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1164,10 +1188,11 @@ mod tests { 
let documents = documents!([ { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1177,10 +1202,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1190,10 +1216,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1217,10 +1244,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -1237,10 +1265,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); @@ -1249,10 +1278,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1281,10 +1311,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1315,10 +1346,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - 
builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1373,10 +1405,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1426,10 +1459,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1558,10 +1592,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1628,10 +1663,11 @@ mod tests { // index the documents let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1720,10 +1756,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1737,10 +1774,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1759,10 +1797,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1787,10 +1826,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let 
indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1832,10 +1872,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1870,10 +1911,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Create one long document @@ -1884,10 +1926,11 @@ mod tests { // Index this one long document let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1901,7 +1944,7 @@ mod tests { let index = Index::new(options, tmp).unwrap(); let mut wtxn = index.write_txn().unwrap(); let indexer_config = IndexerConfig::default(); - let mut builder = IndexDocuments::new( + let builder = IndexDocuments::new( &mut wtxn, &index, &indexer_config, @@ -1930,8 +1973,10 @@ mod tests { "branch_id_number": 0 }]}; - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index b4c0cb68f..0ed1f1cc0 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -15,6 +15,7 @@ use crate::{Index, Result}; pub fn validate_documents_batch( rtxn: &heed::RoTxn, index: &Index, + autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ccf29eb49..5f39579b7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -735,10 +735,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); 
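// `execute` consumes the builder and performs the actual indexing; the new
// documents only become visible to readers once the surrounding write
// transaction is committed below.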
wtxn.commit().unwrap(); @@ -798,10 +799,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -850,10 +852,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -880,10 +883,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -934,10 +938,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -974,10 +979,11 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1016,10 +1022,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1067,10 +1074,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1110,10 +1118,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, 
&config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1142,10 +1151,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1172,10 +1182,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we provide some stop_words @@ -1251,10 +1262,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction provide some synonyms @@ -1389,10 +1401,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1452,10 +1465,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 8890285e7..83d692d7f 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -29,8 +29,7 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( r#"{ @@ -55,7 +54,8 @@ fn test_facet_distribution_with_no_facet_values() { // index documents let content = 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0b6ce80cc..3b8960fcc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,8 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); @@ -75,7 +74,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 89a6a6eec..a96366f5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -390,8 +390,7 @@ fn criteria_ascdesc() { // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); @@ -422,7 +421,8 @@ fn criteria_ascdesc() { let vector = batch_builder.into_inner().unwrap(); let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(reader).unwrap(); + let (builder, user_error) = builder.add_documents(reader).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 63bf22579..7c4cf8971 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -127,11 +127,10 @@ fn test_typo_disabled_on_word() { let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); - - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); txn.commit().unwrap(); From 2ceeb51c37a98e8233edab175467e11101ef5dd2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:14:20 +0200 Subject: [PATCH 14/35] Support the auto-generated ids when validating documents --- milli/src/update/index_documents/mod.rs | 6 ++++-- 
milli/src/update/index_documents/validate.rs | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae42483f0..1a1fc9a0e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -778,7 +778,8 @@ mod tests { let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -943,7 +944,8 @@ mod tests { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0ed1f1cc0..0cb0b4aff 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -19,20 +19,20 @@ pub fn validate_documents_batch( reader: DocumentsBatchReader<R>, ) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> { let mut cursor = reader.into_cursor(); - let documents_batch_index = cursor.documents_batch_index().clone(); + let mut documents_batch_index = cursor.documents_batch_index().clone(); // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { Some(primary_key) => match documents_batch_index.id(primary_key) { Some(id) => (primary_key, id), + None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - // If there is no document in this batch the best we can do is to return this error. None => Ok(Err(UserError::MissingPrimaryKey)), }; } @@ -40,10 +40,11 @@ pub fn validate_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.contains("id")) + .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => (name.as_str(), *id), + None if autogenerate_docids => ("id", documents_batch_index.insert("id")), None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -56,12 +57,16 @@ pub fn validate_documents_batch( _otherwise => None, }; + let mut count = 0; while let Some(document) = cursor.next_document()? { let document_id = match document.get(primary_key_id) { Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)?
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }, + None if autogenerate_docids => { + format!("{{auto-generated id of the {}th document}}", count) + } None => { return Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -75,6 +80,7 @@ pub fn validate_documents_batch( return Ok(Err(UserError::from(user_error))); } } + count += 1; } Ok(Ok(cursor.into_reader())) From 19eb3b4708c1cd2619d2a77cefbcda7ff5e4bda5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 16:06:52 +0200 Subject: [PATCH 15/35] Make sure that we do not accept floats as document ids --- milli/src/update/index_documents/mod.rs | 47 ++++++++++++++++++++ milli/src/update/index_documents/validate.rs | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1a1fc9a0e..5bce3b851 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1987,4 +1987,51 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn primary_key_must_not_contain_floats() { + let tmp = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp).unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let indexer_config = IndexerConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &indexer_config, + IndexDocumentsConfig::default(), + |_| (), + ) + .unwrap(); + + let doc1 = documents! {[{ + "id": -228142, + "title": "asdsad", + }]}; + + let doc2 = documents! {[{ + "id": 228143.56, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": -228143.56, + "title": "something", + }]}; + + let doc4 = documents!
{[{ "id": 2.0, "title": "something", }]}; + + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc3).unwrap(); + assert!(user_error.is_err()); + let (_builder, user_error) = builder.add_documents(doc4).unwrap(); + assert!(user_error.is_err()); + } } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0cb0b4aff..c69c754ac 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -108,7 +108,7 @@ pub fn validate_document_id_from_json(bytes: &[u8]) -> Result<StdResult<String, UserError>> { - Value::Number(number) => Ok(Ok(number.to_string())), + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } From 8ebf5eed0d80beb623a056d4e70a5d0535cd7181 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 17:58:52 +0200 Subject: [PATCH 16/35] Make the nested primary key work --- milli/src/error.rs | 2 + milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 5 +- milli/src/update/index_documents/validate.rs | 228 ++++++++++++++---- 4 files changed, 191 insertions(+), 46 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a23472951..d05acbe1c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -121,6 +121,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MaxDatabaseSizeReached, #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] MissingDocumentId { primary_key: String, document: Object }, + #[error("Document has too many matching `{}` attributes: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + TooManyDocumentIds { primary_key: String, document: Object }, #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] MissingPrimaryKey, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5bce3b851..ba1064684 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -29,7 +29,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use self::validate::validate_documents_batch; pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_from_json, + extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, }; use crate::documents::{obkv_to_object, DocumentsBatchReader}; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4ece58509..38f6dc8ff 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::index_documents::validate_document_id_from_json; +use crate::update::index_documents::validate_document_id_value; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -806,7 +806,8 @@ fn update_primary_key<'a>( ) -> Result<Cow<'a, str>> { match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { Some((_, bytes)) => { - let value = validate_document_id_from_json(bytes)??; + let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; + let value = validate_document_id_value(document_id)??; serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; Ok(Cow::Owned(value)) } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index c69c754ac..32e8de03f 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -1,11 +1,16 @@ use std::io::{Read, Seek}; +use std::iter; use std::result::Result as StdResult; use serde_json::Value; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{GeoError, InternalError, UserError}; -use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader}; +use crate::update::index_documents::obkv_to_object; +use crate::{FieldId, Index, Object, Result}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; /// This function validates a batch of documents by checking that: /// - we can infer a primary key, @@ -23,10 +28,15 @@ pub fn validate_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. - let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { + let primary_key = match index.primary_key(rtxn)?
{ + Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { + PrimaryKey::nested(primary_key) + } Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => (primary_key, id), - None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), + Some(id) => PrimaryKey::flat(primary_key, id), + None if autogenerate_docids => { + PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) + } None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -43,8 +53,10 @@ pub fn validate_documents_batch( .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { - Some((id, name)) => (name.as_str(), *id), - None if autogenerate_docids => ("id", documents_batch_index.insert("id")), + Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), + None if autogenerate_docids => { + PrimaryKey::flat("id", documents_batch_index.insert("id")) + } None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -59,20 +71,15 @@ pub fn validate_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match document.get(primary_key_id) { - Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? { - Ok(document_id) => document_id, - Err(user_error) => return Ok(Err(user_error)), - }, - None if autogenerate_docids => { - format!("{{auto-generated id of the {}nth document}}", count) - } - None => { - return Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, - })) - } + let document_id = match fetch_document_id( + &document, + &documents_batch_index, + primary_key, + autogenerate_docids, + count, + )? { + Ok(document_id) => document_id, + Err(user_error) => return Ok(Err(user_error)), }; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { @@ -86,30 +93,167 @@ pub fn validate_documents_batch( Ok(Ok(cursor.into_reader())) } +/// Retrieve the document id after validating it, returning a `UserError` +/// if the id is invalid or can't be guessed. +fn fetch_document_id( + document: &obkv::KvReader, + documents_batch_index: &DocumentsBatchIndex, + primary_key: PrimaryKey, + autogenerate_docids: bool, + count: usize, +) -> Result> { + match primary_key { + PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { + match document.get(primary_key_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => Ok(Err(user_error)), + } + } + None if autogenerate_docids => { + Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + } + None => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })), + } + } + nested @ PrimaryKey::Nested { .. 
} => {
+            let mut matching_documents_ids = Vec::new();
+            for (first_level_name, right) in nested.possible_level_names() {
+                if let Some(field_id) = documents_batch_index.id(first_level_name) {
+                    if let Some(value_bytes) = document.get(field_id) {
+                        let object = serde_json::from_slice(value_bytes)
+                            .map_err(InternalError::SerdeJson)?;
+                        fetch_matching_values(object, right, &mut matching_documents_ids);
+
+                        if matching_documents_ids.len() >= 2 {
+                            return Ok(Err(UserError::TooManyDocumentIds {
+                                primary_key: nested.primary_key().to_string(),
+                                document: obkv_to_object(&document, &documents_batch_index)?,
+                            }));
+                        }
+                    }
+                }
+            }
+
+            match matching_documents_ids.pop() {
+                Some(document_id) => match validate_document_id_value(document_id)? {
+                    Ok(document_id) => Ok(Ok(document_id)),
+                    Err(user_error) => Ok(Err(user_error)),
+                },
+                None => Ok(Err(UserError::MissingDocumentId {
+                    primary_key: nested.primary_key().to_string(),
+                    document: obkv_to_object(&document, &documents_batch_index)?,
+                })),
+            }
+        }
+    }
+}
+
+/// A type that represents the type of primary key that has been set
+/// for this index, either a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+enum PrimaryKey<'a> {
+    Flat { name: &'a str, field_id: FieldId },
+    Nested { name: &'a str },
+}
+
+impl PrimaryKey<'_> {
+    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
+        PrimaryKey::Flat { name, field_id }
+    }
+
+    fn nested(name: &str) -> PrimaryKey {
+        PrimaryKey::Nested { name }
+    }
+
+    fn primary_key(&self) -> &str {
+        match self {
+            PrimaryKey::Flat { name, .. } => name,
+            PrimaryKey::Nested { name } => name,
+        }
+    }
+
+    /// Returns an `Iterator` that gives all the possible field names the primary key
+    /// can have, depending on the first-level name and the depth of the objects.
+    fn possible_level_names(&self) -> impl Iterator + '_ {
+        let name = self.primary_key();
+        iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
+    }
+}
+
+fn contained_in(selector: &str, key: &str) -> bool {
+    selector.starts_with(key)
+        && selector[key.len()..]
+            .chars()
+            .next()
+            .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL)
+            .unwrap_or(true)
+}
+
+pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) {
+    match value {
+        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+        otherwise => output.push(otherwise),
+    }
+}
+
+pub fn fetch_matching_values_in_object(
+    object: Object,
+    selector: &str,
+    base_key: &str,
+    output: &mut Vec,
+) {
+    for (key, value) in object {
+        let base_key = if base_key.is_empty() {
+            key.to_string()
+        } else {
+            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+        };
+
+        // here if the user only specified `doggo` we need to iterate over all the fields of `doggo`
+        // so we check the contained_in on both sides.
+        let should_continue =
+            contained_in(selector, &base_key) || contained_in(&base_key, selector);
+
+        if should_continue {
+            match value {
+                Value::Object(object) => {
+                    fetch_matching_values_in_object(object, selector, &base_key, output)
+                }
+                value => output.push(value),
+            }
+        }
+    }
+}
+
 /// Returns a trimmed version of the document id or `None` if it is invalid.
 pub fn validate_document_id(document_id: &str) -> Option<&str> {
-    let id = document_id.trim();
-    if !id.is_empty()
-        && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    let document_id = document_id.trim();
+    if !document_id.is_empty()
+        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
     {
-        Some(id)
+        Some(document_id)
     } else {
         None
     }
 }
 
-/// Parses a Json encoded document id and validate it, returning a user error when it is one.
-pub fn validate_document_id_from_json(bytes: &[u8]) -> Result> {
-    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+/// Parses a JSON encoded document id and validates it, returning a user error when there is one.
+pub fn validate_document_id_value(document_id: Value) -> Result> {
+    match document_id {
         Value::String(string) => match validate_document_id(&string) {
             Some(s) if s.len() == string.len() => Ok(Ok(string)),
             Some(s) => Ok(Ok(s.to_string())),
-            None => {
-                return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) }))
-            }
+            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
         },
         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
-        content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
+        content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
    }
 }
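To make the id rules above concrete, here is a minimal, self-contained sketch (not part of the patch) that mirrors the trimming and charset checks of `validate_document_id`; the sample ids are invented for the example:

```rust
fn validate_document_id(document_id: &str) -> Option<&str> {
    let document_id = document_id.trim();
    // Only ASCII letters, digits, hyphens and underscores are accepted.
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
}

fn main() {
    assert_eq!(validate_document_id("  doc-42_A  "), Some("doc-42_A"));
    assert_eq!(validate_document_id("doc 42"), None); // spaces are rejected
    assert_eq!(validate_document_id(""), None); // empty ids are rejected
}
```

On top of these string rules, `validate_document_id_value` also accepts `i64` JSON numbers by stringifying them, and reports any other JSON value as an `InvalidDocumentId` user error.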
@@ -124,24 +268,22 @@ pub fn extract_float_from_value(value: Value) -> StdResult {
     }
 }
 
 pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> {
-    let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
         Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
             (Some(lat), Some(lng)) => {
                 match (extract_float_from_value(lat), extract_float_from_value(lng)) {
-                    (Ok(_), Ok(_)) => Ok(()),
-                    (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }),
-                    (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }),
+                    (Ok(_), Ok(_)) => Ok(Ok(())),
+                    (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })),
+                    (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })),
                     (Err(lat), Err(lng)) => {
-                        Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })
+                        Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }))
                     }
                 }
             }
-            (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }),
-            (Some(_), None) => Err(GeoError::MissingLongitude { document_id }),
-            (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }),
+            (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })),
+            (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })),
+            (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })),
         },
-        value => Err(GeoError::NotAnObject { document_id, value }),
-    };
-
-    Ok(result)
+        value => Ok(Err(GeoError::NotAnObject { document_id, value })),
    }
 }

From dc3f092d0782fd7b39bb29aa424caa04ada8de3c Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Thu, 16 Jun 2022 12:03:43 +0200
Subject: [PATCH 17/35] Do not leak an internal grenad Error

---
 milli/src/documents/mod.rs    |  2 +-
 milli/src/documents/reader.rs | 32 ++++++++++++++++++++++++++++++--
 milli/src/error.rs            |  7 +++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index ee3593bf8..66a05b7b6 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -7,7 +7,7 @@ use
std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; use obkv::KvReader; -pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; +pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::error::{FieldIdMapMissingEntry, InternalError}; diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 3dff999f5..720b403b9 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::io; +use std::{error, fmt, io}; use obkv::KvReader; @@ -79,7 +79,9 @@ impl DocumentsBatchCursor { impl DocumentsBatchCursor { /// Returns the next document, starting from the first one. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. - pub fn next_document(&mut self) -> Result>, grenad::Error> { + pub fn next_document( + &mut self, + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_next()? { Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { Ok(Some(KvReader::new(value))) @@ -88,3 +90,29 @@ impl DocumentsBatchCursor { } } } + +/// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. +#[derive(Debug)] +pub struct DocumentsBatchCursorError { + inner: grenad::Error, +} + +impl From for DocumentsBatchCursorError { + fn from(error: grenad::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError { inner: error } + } +} + +impl Into for DocumentsBatchCursorError { + fn into(self) -> grenad::Error { + self.inner + } +} + +impl error::Error for DocumentsBatchCursorError {} + +impl fmt::Display for DocumentsBatchCursorError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt(f) + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index d05acbe1c..d9dca287d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; +use crate::documents::DocumentsBatchCursorError; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -209,6 +210,12 @@ where } } +impl From for Error { + fn from(error: DocumentsBatchCursorError) -> Error { + Error::from(Into::::into(error)) + } +} + impl From for Error { fn from(_error: Infallible) -> Error { unreachable!() From ea852200bbb9d9520ab7e13e5f6019c3b7c7fed3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 10:28:49 +0200 Subject: [PATCH 18/35] Fix the format used for a geo deleting benchmark --- benchmarks/benches/indexing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 1b501b21a..80c7ba0ed 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1137,7 +1137,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); - let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); builder.execute().unwrap(); From 6a0a0ae94f38ee04ffed9bd252444128163b1550 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 20 Jun 2022 13:48:02 +0200 Subject: [PATCH 19/35] Make the 
Transform read from an EnrichedDocumentsBatchReader --- milli/src/documents/enriched.rs | 103 ++++++++++++++++++ milli/src/documents/mod.rs | 13 +++ milli/src/documents/reader.rs | 20 ++-- milli/src/error.rs | 10 +- milli/src/update/index_documents/mod.rs | 6 +- milli/src/update/index_documents/transform.rs | 8 +- milli/src/update/index_documents/validate.rs | 22 ++-- 7 files changed, 158 insertions(+), 24 deletions(-) create mode 100644 milli/src/documents/enriched.rs diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs new file mode 100644 index 000000000..8645e06c4 --- /dev/null +++ b/milli/src/documents/enriched.rs @@ -0,0 +1,103 @@ +use std::fs::File; +use std::{io, str}; + +use obkv::KvReader; + +use super::{ + DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, + Error, +}; +use crate::FieldId; + +/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have +/// been created with a `DocumentsBatchWriter` and, for the enriched data, +/// a simple `grenad::Reader`. +/// +/// The documents are returned in the form of `obkv::Reader` where each field is identified with a +/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. +pub struct EnrichedDocumentsBatchReader { + documents: DocumentsBatchReader, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchReader { + pub fn new( + documents: DocumentsBatchReader, + external_ids: grenad::Reader, + ) -> Result { + if documents.documents_count() as u64 == external_ids.len() { + Ok(EnrichedDocumentsBatchReader { + documents, + external_ids: external_ids.into_cursor()?, + }) + } else { + Err(Error::InvalidEnrichedData) + } + } + + pub fn documents_count(&self) -> u32 { + self.documents.documents_count() + } + + pub fn is_empty(&self) -> bool { + self.documents.is_empty() + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// This method returns a forward cursor over the enriched documents. + pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + let EnrichedDocumentsBatchReader { documents, mut external_ids } = self; + external_ids.reset(); + EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct EnrichedDocument<'a> { + pub document: KvReader<'a, FieldId>, + pub external_id: &'a str, +} + +pub struct EnrichedDocumentsBatchCursor { + documents: DocumentsBatchCursor, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchCursor { + pub fn into_reader(self) -> EnrichedDocumentsBatchReader { + let EnrichedDocumentsBatchCursor { documents, external_ids } = self; + EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids } + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// Resets the cursor to be able to read from the start again. + pub fn reset(&mut self) { + self.documents.reset(); + self.external_ids.reset(); + } +} + +impl EnrichedDocumentsBatchCursor { + /// Returns the next document, starting from the first one. Subsequent calls to + /// `next_document` advance the document reader until all the documents have been read. + pub fn next_enriched_document( + &mut self, + ) -> Result, DocumentsBatchCursorError> { + let document = self.documents.next_document()?; + let external_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => Some(str::from_utf8(bytes)?), + None => None, + }; + + match document.zip(external_id) { + Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + None => Ok(None), + } + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 66a05b7b6..43bfc1c20 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,11 +1,14 @@ mod builder; +mod enriched; mod reader; use std::fmt::{self, Debug}; use std::io; +use std::str::Utf8Error; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +90,8 @@ impl DocumentsBatchIndex { pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, + InvalidEnrichedData, + InvalidUtf8(Utf8Error), Csv(csv::Error), Json(serde_json::Error), Serialize(serde_json::Error), @@ -118,6 +123,12 @@ impl From for Error { } } +impl From for Error { + fn from(other: Utf8Error) -> Self { + Self::InvalidUtf8(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -127,6 +138,8 @@ impl fmt::Display for Error { Error::InvalidDocumentFormat => { f.write_str("Invalid document addition format, missing the documents batch index.") } + Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), + Error::InvalidUtf8(e) => write!(f, "{}", e), Error::Io(e) => write!(f, "{}", e), Error::Serialize(e) => write!(f, "{}", e), Error::Grenad(e) => write!(f, "{}", e), diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 720b403b9..7bd6dbd51 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io}; +use std::{error, fmt, io, str}; use obkv::KvReader; @@ -93,19 +93,20 @@ impl DocumentsBatchCursor { /// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. 
#[derive(Debug)] -pub struct DocumentsBatchCursorError { - inner: grenad::Error, +pub enum DocumentsBatchCursorError { + Grenad(grenad::Error), + Utf8(str::Utf8Error), } impl From for DocumentsBatchCursorError { fn from(error: grenad::Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError { inner: error } + DocumentsBatchCursorError::Grenad(error) } } -impl Into for DocumentsBatchCursorError { - fn into(self) -> grenad::Error { - self.inner +impl From for DocumentsBatchCursorError { + fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::Utf8(error) } } @@ -113,6 +114,9 @@ impl error::Error for DocumentsBatchCursorError {} impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt(f) + match self { + DocumentsBatchCursorError::Grenad(e) => e.fmt(f), + DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index d9dca287d..0419ceeda 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,7 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; -use crate::documents::DocumentsBatchCursorError; +use crate::documents::{self, DocumentsBatchCursorError}; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -36,6 +36,8 @@ pub enum InternalError { FieldIdMappingMissingEntry { key: FieldId }, #[error(transparent)] Fst(#[from] fst::Error), + #[error(transparent)] + DocumentsError(#[from] documents::Error), #[error("Invalid compression type have been specified to grenad.")] GrenadInvalidCompressionType, #[error("Invalid grenad file with an invalid version format.")] @@ -185,6 +187,7 @@ macro_rules! error_from_sub_error { error_from_sub_error! { FieldIdMapMissingEntry => InternalError, fst::Error => InternalError, + documents::Error => InternalError, str::Utf8Error => InternalError, ThreadPoolBuildError => InternalError, SerializationError => InternalError, @@ -212,7 +215,10 @@ where impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { - Error::from(Into::::into(error)) + match error { + DocumentsBatchCursorError::Grenad(e) => Error::from(e), + DocumentsBatchCursorError::Utf8(e) => Error::from(e), + } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba1064684..fe3bd1f8f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,7 +27,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_documents_batch; +use self::validate::validate_and_enrich_documents_batch; pub use self::validate::{ extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch( + let enriched_documents_reader = match validate_and_enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, @@ -155,7 +155,7 @@ where .transform .as_mut() .expect("Invalid document addition state") - .read_documents(reader, self.wtxn, &self.progress)? 
+            .read_documents(enriched_documents_reader, self.wtxn, &self.progress)?
             as u64;
         self.added_documents += indexed_documents;
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 38f6dc8ff..4d0a4c311 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -14,7 +14,7 @@ use smartstring::SmartString;
 
 use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
 use super::{IndexDocumentsMethod, IndexerConfig};
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
+use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
 use crate::update::index_documents::validate_document_id_value;
@@ -153,7 +153,7 @@ impl<'a, 'i> Transform<'a, 'i> {
     pub fn read_documents(
         &mut self,
-        reader: DocumentsBatchReader,
+        reader: EnrichedDocumentsBatchReader,
         wtxn: &mut heed::RwTxn,
         progress_callback: F,
     ) -> Result
@@ -189,7 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut external_id_buffer = Vec::new();
         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
         let addition_index = cursor.documents_batch_index().clone();
-        while let Some(document) = cursor.next_document()? {
+        while let Some(enriched_document) = cursor.next_enriched_document()? {
+            let EnrichedDocument { document, external_id } = enriched_document;
+
             let mut field_buffer_cache = drop_and_reuse(field_buffer);
             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
                 progress_callback(UpdateIndexingStep::RemapDocumentAddition {
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 32e8de03f..8b68532cb 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -4,27 +4,28 @@ use std::result::Result as StdResult;
 
 use serde_json::Value;
 
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
+use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
 use crate::error::{GeoError, InternalError, UserError};
-use crate::update::index_documents::obkv_to_object;
+use crate::update::index_documents::{obkv_to_object, writer_into_reader};
 use crate::{FieldId, Index, Object, Result};
 
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 
-/// This function validates a documents by checking that:
+/// This function validates and enriches the documents by checking that:
 /// - we can infer a primary key,
-/// - all the documents id exist and,
+/// - all the document ids exist and are extracted,
 /// - the validity of them but also,
 /// - the validity of the `_geo` field depending on the settings.
-pub fn validate_documents_batch(
+pub fn validate_and_enrich_documents_batch(
     rtxn: &heed::RoTxn,
     index: &Index,
     autogenerate_docids: bool,
     reader: DocumentsBatchReader,
-) -> Result, UserError>> {
+) -> Result, UserError>> {
     let mut cursor = reader.into_cursor();
     let mut documents_batch_index = cursor.documents_batch_index().clone();
+    let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
 
     // The primary key *field id* that has already been set for this index or the one
     // we will guess by searching for the first key that contains "id" as a substring.
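A note on the signature above: the enriching step returns a nested `Result` so that internal errors and user-facing validation errors travel on separate tracks, as the comment in `mod.rs` explains. A condensed sketch of that convention, with hypothetical stand-in error types rather than milli's real ones:

```rust
#[derive(Debug)]
struct UserError(&'static str);
#[derive(Debug)]
struct InternalError(&'static str);

// Same shape as the enriching function above: the outer `Result` is for
// internal errors that abort indexing, the inner one for user errors that
// only reject the current documents batch.
fn enrich(batch_is_valid: bool) -> Result<Result<u32, UserError>, InternalError> {
    if batch_is_valid {
        Ok(Ok(1)) // the number of enriched documents
    } else {
        Ok(Err(UserError("missing document id")))
    }
}

fn main() -> Result<(), InternalError> {
    match enrich(false)? {
        Ok(count) => println!("enriched {} documents", count),
        // Recoverable: report the error and keep accepting new batches.
        Err(user_error) => eprintln!("batch rejected: {:?}", user_error),
    }
    Ok(())
}
```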
@@ -82,6 +83,8 @@ pub fn validate_documents_batch(
             Err(user_error) => return Ok(Err(user_error)),
         };
 
+        external_ids.insert(count.to_be_bytes(), &document_id)?;
+
         if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
             if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
                 return Ok(Err(UserError::from(user_error)));
@@ -90,7 +93,10 @@ pub fn validate_documents_batch(
         count += 1;
     }
 
-    Ok(Ok(cursor.into_reader()))
+    let external_ids = writer_into_reader(external_ids)?;
+    let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?;
+
+    Ok(Ok(reader))
 }
 
 /// Retrieve the document id after validating it, returning a `UserError`
@@ -100,7 +106,7 @@ fn fetch_document_id(
     documents_batch_index: &DocumentsBatchIndex,
     primary_key: PrimaryKey,
     autogenerate_docids: bool,
-    count: usize,
+    count: u32,
 ) -> Result> {
     match primary_key {
         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
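One detail worth noting in the hunk above: the external ids are inserted under `count.to_be_bytes()` keys. A grenad writer expects keys in ascending byte order, and big-endian encoding makes the byte order of a `u32` agree with its numeric order, so inserting the ids in document order is always valid. A small self-contained sketch of that property:

```rust
fn main() {
    // Big-endian keys: byte-wise ordering matches numeric ordering.
    let be: Vec<[u8; 4]> = (250u32..260).map(u32::to_be_bytes).collect();
    assert!(be.windows(2).all(|w| w[0] <= w[1]));

    // Little-endian keys would not stay sorted: crossing 255 -> 256
    // produces [255, 0, 0, 0] followed by [0, 1, 0, 0].
    let le: Vec<[u8; 4]> = (250u32..260).map(u32::to_le_bytes).collect();
    assert!(!le.windows(2).all(|w| w[0] <= w[1]));
}
```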
From 5f1bfb73eeeb1da17a1d2598d01916b08022aa8c Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 10:45:25 +0200
Subject: [PATCH 20/35] Extract the primary key name and make it accessible

---
 milli/src/documents/enriched.rs              | 28 +++++++++++++++++---
 milli/src/update/index_documents/validate.rs |  6 ++++-
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs
index 8645e06c4..918b47c95 100644
--- a/milli/src/documents/enriched.rs
+++ b/milli/src/documents/enriched.rs
@@ -17,17 +17,20 @@ use crate::FieldId;
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
 pub struct EnrichedDocumentsBatchReader {
     documents: DocumentsBatchReader,
+    primary_key: String,
     external_ids: grenad::ReaderCursor,
 }
 
 impl EnrichedDocumentsBatchReader {
     pub fn new(
         documents: DocumentsBatchReader,
+        primary_key: String,
         external_ids: grenad::Reader,
     ) -> Result {
         if documents.documents_count() as u64 == external_ids.len() {
             Ok(EnrichedDocumentsBatchReader {
                 documents,
+                primary_key,
                 external_ids: external_ids.into_cursor()?,
             })
         } else {
             Err(Error::InvalidEnrichedData)
@@ -39,6 +42,10 @@ impl EnrichedDocumentsBatchReader {
         self.documents.documents_count()
     }
 
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
+    }
+
     pub fn is_empty(&self) -> bool {
         self.documents.is_empty()
     }
@@ -49,9 +56,13 @@ impl EnrichedDocumentsBatchReader {
     /// This method returns a forward cursor over the enriched documents.
     pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor {
-        let EnrichedDocumentsBatchReader { documents, mut external_ids } = self;
+        let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
         external_ids.reset();
-        EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids }
+        EnrichedDocumentsBatchCursor {
+            documents: documents.into_cursor(),
+            primary_key,
+            external_ids,
+        }
     }
 }
 
@@ -63,13 +74,22 @@ pub struct EnrichedDocument<'a> {
 
 pub struct EnrichedDocumentsBatchCursor {
     documents: DocumentsBatchCursor,
+    primary_key: String,
     external_ids: grenad::ReaderCursor,
 }
 
 impl EnrichedDocumentsBatchCursor {
     pub fn into_reader(self) -> EnrichedDocumentsBatchReader {
-        let EnrichedDocumentsBatchCursor { documents, external_ids } = self;
-        EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids }
+        let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self;
+        EnrichedDocumentsBatchReader {
+            documents: documents.into_reader(),
+            primary_key,
+            external_ids,
+        }
+    }
+
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
     }
 
     pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 8b68532cb..83d7ef38e 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -94,7 +94,11 @@ pub fn validate_and_enrich_documents_batch(
     }
 
     let external_ids = writer_into_reader(external_ids)?;
-    let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?;
+    let reader = EnrichedDocumentsBatchReader::new(
+        cursor.into_reader(),
+        primary_key.primary_key().to_string(),
+        external_ids,
+    )?;
 
     Ok(Ok(reader))
 }
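Since the reader now carries the (possibly nested) primary key name itself, it is worth recalling how such a name expands into candidate field lookups. The following standalone sketch mirrors the `possible_level_names` iterator introduced earlier in this series; the sample key is invented:

```rust
use std::iter;

const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

// Candidate (first-level field name, remainder) pairs are produced by
// splitting the nested primary key at its rightmost separator first.
fn possible_level_names(name: &str) -> impl Iterator<Item = (&str, &str)> + '_ {
    iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
}

fn main() {
    let candidates: Vec<_> = possible_level_names("user.address.id").collect();
    assert_eq!(
        candidates,
        vec![("user.address.id", ""), ("user.address", "id"), ("user", "address")]
    );
}
```

Each candidate's first component is looked up as a top-level field of the document; the two-way `contained_in` check from patch 16 then walks the remaining levels inside the matching object.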
From 742543091e9b33084952ad27e00eff60655d7bf0 Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 10:48:07 +0200
Subject: [PATCH 21/35] Constify the default primary key name

---
 milli/src/update/index_documents/validate.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 83d7ef38e..4e52f4cb9 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -12,6 +12,9 @@ use crate::{FieldId, Index, Object, Result};
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 
+/// The default primary key that is used when none is specified.
+const DEFAULT_PRIMARY_KEY: &str = "id";
+
 /// This function validates and enriches the documents by checking that:
 /// - all the document ids exist and are extracted,
@@ -51,13 +54,14 @@ pub fn validate_and_enrich_documents_batch(
             None => {
                 let guessed = documents_batch_index
                     .iter()
-                    .filter(|(_, name)| name.to_lowercase().contains("id"))
+                    .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
                     .min_by_key(|(fid, _)| *fid);
                 match guessed {
                     Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
-                    None if autogenerate_docids => {
-                        PrimaryKey::flat("id", documents_batch_index.insert("id"))
-                    }
+                    None if autogenerate_docids => PrimaryKey::flat(
+                        DEFAULT_PRIMARY_KEY,
+                        documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
+                    ),
                     None => return Ok(Err(UserError::MissingPrimaryKey)),
                 }
             }

From 905af2a2e9f69b2341d532b105bbf1b3baff819e Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 11:12:51 +0200
Subject: [PATCH 22/35] Use the primary key and external id in the transform

---
 milli/src/update/index_documents/transform.rs | 290 +++++-------------
 1 file changed, 79 insertions(+), 211 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 4d0a4c311..e82556ec7 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -9,7 +9,7 @@ use heed::RoTxn;
 use itertools::Itertools;
 use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;
-use serde_json::{Map, Value};
+use serde_json::Value;
 use smartstring::SmartString;
 
 use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
@@ -17,15 +17,12 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
-use crate::update::index_documents::validate_document_id_value;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
     Result, BEU32,
 };
 
-const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
-
 pub struct TransformOutput {
     pub primary_key: String,
     pub fields_ids_map: FieldsIdsMap,
@@ -85,18 +82,6 @@ fn create_fields_mapping(
         .collect()
 }
 
-/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields.
-/// It doesn't look in the subfield because we don't want to enable the
-/// primary key inference on nested objects.
-fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
-    index
-        .iter()
-        .sorted_by_key(|(k, _)| *k)
-        .map(|(_, v)| v)
-        .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
-        .map(String::as_str)
-}
-
 impl<'a, 'i> Transform<'a, 'i> {
     pub fn new(
         wtxn: &mut heed::RwTxn,
@@ -167,28 +152,15 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
 
-        let alternative_name = self
-            .index
-            .primary_key(wtxn)?
- .or_else(|| find_primary_key(fields_index)) - .map(String::from); - - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(wtxn)?, - &mut self.fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - - let primary_key_id_nested = primary_key_name.contains('.'); + let primary_key = cursor.primary_key().to_string(); + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; + let primary_key_id_nested = primary_key.contains('.'); let mut flattened_document = None; let mut obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; - let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - let addition_index = cursor.documents_batch_index().clone(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, external_id } = enriched_document; @@ -210,8 +182,7 @@ impl<'a, 'i> Transform<'a, 'i> { // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. - let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; - let external_id = if primary_key_id_nested { + if primary_key_id_nested { let mut field_buffer_cache = field_buffer_cache.clone(); self.flatten_from_field_mapping( &mapping, @@ -220,29 +191,6 @@ impl<'a, 'i> Transform<'a, 'i> { &mut field_buffer_cache, )?; flattened_document = Some(&flattened_obkv_buffer); - let document = KvReader::new(&flattened_obkv_buffer); - - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? - } else { - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? }; // Insertion in a obkv need to be done with keys ordered. For now they are ordered @@ -318,7 +266,6 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); - external_id_buffer.clear(); obkv_buffer.clear(); } @@ -327,7 +274,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; - self.index.put_primary_key(wtxn, &primary_key_name)?; + self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. @@ -749,42 +696,6 @@ impl<'a, 'i> Transform<'a, 'i> { } } -/// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name) -/// for the primary key according to the following rules: -/// - if primary_key is `Some`, returns the id and the name, else -/// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else -/// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and -/// returns the pair, else -/// - returns an error. 
-fn compute_primary_key_pair( - primary_key: Option<&str>, - fields_ids_map: &mut FieldsIdsMap, - alternative_name: Option, - autogenerate_docids: bool, -) -> Result<(FieldId, String)> { - match primary_key { - Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; - Ok((id, primary_key.to_string())) - } - None => { - let name = match alternative_name { - Some(key) => key, - None => { - if !autogenerate_docids { - // If there is no primary key in the current document batch, we must - // return an error and not automatically generate any document id. - return Err(UserError::MissingPrimaryKey.into()); - } - DEFAULT_PRIMARY_KEY_NAME.to_string() - } - }; - let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; - Ok((id, name)) - } - } -} - /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. /// /// The size and alignment of T and U must match. @@ -796,49 +707,6 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { vec.into_iter().map(|_| unreachable!()).collect() } -fn update_primary_key<'a>( - document: KvReader<'a, FieldId>, - addition_index: &DocumentsBatchIndex, - primary_key_id: FieldId, - primary_key_name: &str, - uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], - field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, - mut external_id_buffer: &'a mut Vec, - autogenerate_docids: bool, -) -> Result> { - match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { - Some((_, bytes)) => { - let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; - let value = validate_document_id_value(document_id)??; - serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; - Ok(Cow::Owned(value)) - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - serde_json::to_writer(&mut external_id_buffer, &uuid) - .map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); - Ok(Cow::Borrowed(&*uuid)) - } - None => { - let mut json = Map::new(); - for (key, value) in document.iter() { - let key = addition_index.name(key).map(ToString::to_string); - let value = serde_json::from_slice::(&value).ok(); - - if let Some((k, v)) = key.zip(value) { - json.insert(k, v); - } - } - - Err(UserError::MissingDocumentId { - primary_key: primary_key_name.to_string(), - document: json, - })? 
- } - } -} - impl TransformOutput { // find and insert the new field ids pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { @@ -853,87 +721,87 @@ impl TransformOutput { } } -#[cfg(test)] -mod test { - use super::*; +// #[cfg(test)] +// mod test { +// use super::*; - mod compute_primary_key { - use big_s::S; +// mod compute_primary_key { +// use big_s::S; - use super::{compute_primary_key_pair, FieldsIdsMap}; +// use super::{compute_primary_key_pair, FieldsIdsMap}; - #[test] - fn should_return_primary_key_if_is_some() { - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto").unwrap(); - let result = compute_primary_key_pair( - Some("toto"), - &mut fields_map, - Some("tata".to_string()), - false, - ); - assert_eq!(result.unwrap(), (0, "toto".to_string())); - assert_eq!(fields_map.len(), 1); +// #[test] +// fn should_return_primary_key_if_is_some() { +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto"), +// &mut fields_map, +// Some("tata".to_string()), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto".to_string())); +// assert_eq!(fields_map.len(), 1); - // and with nested fields - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto.tata").unwrap(); - let result = compute_primary_key_pair( - Some("toto.tata"), - &mut fields_map, - Some(S("titi")), - false, - ); - assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); - assert_eq!(fields_map.len(), 1); - } +// // and with nested fields +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto.tata").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto.tata"), +// &mut fields_map, +// Some(S("titi")), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_alternative_if_primary_is_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = - compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0, S("tata"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_alternative_if_primary_is_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = +// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); +// assert_eq!(result.unwrap(), (0, S("tata"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_default_if_both_are_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0, S("id"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_default_if_both_are_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, true); +// assert_eq!(result.unwrap(), (0, S("id"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, false); - assert!(result.is_err()); - assert_eq!(fields_map.len(), 0); - } - } +// #[test] +// fn should_return_err_if_both_are_none_and_recompute_is_false() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, false); +// assert!(result.is_err()); +// 
assert_eq!(fields_map.len(), 0); +// } +// } - mod primary_key_inference { - use big_s::S; - use bimap::BiHashMap; +// mod primary_key_inference { +// use big_s::S; +// use bimap::BiHashMap; - use crate::documents::DocumentsBatchIndex; - use crate::update::index_documents::transform::find_primary_key; +// use crate::documents::DocumentsBatchIndex; +// use crate::update::index_documents::transform::find_primary_key; - #[test] - fn primary_key_infered_on_first_field() { - // We run the test multiple times to change the order in which the fields are iterated upon. - for _ in 1..50 { - let mut map = BiHashMap::new(); - map.insert(1, S("fakeId")); - map.insert(2, S("fakeId")); - map.insert(3, S("fakeId")); - map.insert(4, S("fakeId")); - map.insert(0, S("realId")); +// #[test] +// fn primary_key_infered_on_first_field() { +// // We run the test multiple times to change the order in which the fields are iterated upon. +// for _ in 1..50 { +// let mut map = BiHashMap::new(); +// map.insert(1, S("fakeId")); +// map.insert(2, S("fakeId")); +// map.insert(3, S("fakeId")); +// map.insert(4, S("fakeId")); +// map.insert(0, S("realId")); - assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); - } - } - } -} +// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); +// } +// } +// } +// } From c8ebf0de47e09964a9b4060b1da9982a593040e1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:14:14 +0200 Subject: [PATCH 23/35] Rename the validate function as an enriching function --- .../index_documents/{validate.rs => enrich.rs} | 2 +- milli/src/update/index_documents/mod.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) rename milli/src/update/index_documents/{validate.rs => enrich.rs} (99%) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/enrich.rs similarity index 99% rename from milli/src/update/index_documents/validate.rs rename to milli/src/update/index_documents/enrich.rs index 4e52f4cb9..e3c3bd6f6 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -20,7 +20,7 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. 
-pub fn validate_and_enrich_documents_batch( +pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fe3bd1f8f..db1a768e6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,8 +1,8 @@ +mod enrich; mod extract; mod helpers; mod transform; mod typed_chunk; -mod validate; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; @@ -19,6 +19,11 @@ use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use self::enrich::enrich_documents_batch; +pub use self::enrich::{ + extract_float_from_value, validate_document_id, validate_document_id_value, + validate_geo_from_json, +}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, @@ -27,11 +32,6 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_and_enrich_documents_batch; -pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, -}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let enriched_documents_reader = match validate_and_enrich_documents_batch( + let enriched_documents_reader = match enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, From d1a4da98127207b7ce560280911480a4e0a2ced4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:16:59 +0200 Subject: [PATCH 24/35] Generate a real UUIDv4 when ids are auto-generated --- milli/src/update/index_documents/enrich.rs | 93 ++++++++++++++----- .../extract/extract_geo_points.rs | 14 +-- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index e3c3bd6f6..5d00565a8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,6 +1,6 @@ use std::io::{Read, Seek}; -use std::iter; use std::result::Result as StdResult; +use std::{fmt, iter}; use serde_json::Value; @@ -29,6 +29,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; + let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -76,31 +77,33 @@ pub fn enrich_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match fetch_document_id( + let document_id = match fetch_or_generate_document_id( &document, &documents_batch_index, primary_key, autogenerate_docids, + &mut uuid_buffer, count, )? 
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), &document_id)?; + external_ids.insert(count.to_be_bytes(), document_id.value())?; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { - if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { + if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + count += 1; } let external_ids = writer_into_reader(external_ids)?; let reader = EnrichedDocumentsBatchReader::new( cursor.into_reader(), - primary_key.primary_key().to_string(), + primary_key.name().to_string(), external_ids, )?; @@ -109,13 +112,14 @@ pub fn enrich_documents_batch( /// Retrieve the document id after validating it, returning a `UserError` /// if the id is invalid or can't be guessed. -fn fetch_document_id( +fn fetch_or_generate_document_id( document: &obkv::KvReader, documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, + uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], count: u32, -) -> Result> { +) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { match document.get(primary_key_id) { @@ -123,12 +127,13 @@ fn fetch_document_id( let document_id = serde_json::from_slice(document_id_bytes) .map_err(InternalError::SerdeJson)?; match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), } } None if autogenerate_docids => { - Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -147,7 +152,7 @@ fn fetch_document_id( if matching_documents_ids.len() >= 2 { return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })); } @@ -157,11 +162,11 @@ fn fetch_document_id( match matching_documents_ids.pop() { Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), }, None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })), } @@ -186,7 +191,7 @@ impl PrimaryKey<'_> { PrimaryKey::Nested { name } } - fn primary_key(&self) -> &str { + fn name(&self) -> &str { match self { PrimaryKey::Flat { name, .. } => name, PrimaryKey::Nested { name } => name, @@ -196,11 +201,53 @@ impl PrimaryKey<'_> { /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.primary_key(); + let name = self.name(); iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) } } +/// A type that represents a document id that has been retrieved from a document or auto-generated. 
+/// +/// In case the document id has been auto-generated, the document nth is kept to help +/// users debug if there is an issue with the document itself. +#[derive(Clone)] +pub enum DocumentId { + Retrieved { value: String }, + Generated { value: String, document_nth: u32 }, +} + +impl DocumentId { + fn retrieved(value: String) -> DocumentId { + DocumentId::Retrieved { value } + } + + fn generated(value: String, document_nth: u32) -> DocumentId { + DocumentId::Generated { value, document_nth } + } + + fn value(&self) -> &str { + match self { + DocumentId::Retrieved { value } => value, + DocumentId::Generated { value, .. } => value, + } + } + + fn debug(&self) -> String { + format!("{:?}", self) + } +} + +impl fmt::Debug for DocumentId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DocumentId::Retrieved { value } => write!(f, "{:?}", value), + DocumentId::Generated { value, document_nth } => { + write!(f, "{{{:?}}} of the {}nth document", value, document_nth) + } + } + } +} + fn contained_in(selector: &str, key: &str) -> bool { selector.starts_with(key) && selector[key.len()..] @@ -281,23 +328,25 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } } -pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { +pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { + use GeoError::*; + let debug_id = || Value::from(id.debug()); match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), - (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), - (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), + (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), + (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), (Err(lat), Err(lng)) => { - Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) + Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) } } } - (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), - (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), - (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), + (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), + (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), + (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), }, - value => Ok(Err(GeoError::NotAnObject { document_id, value })), + value => Ok(Err(NotAnObject { document_id: debug_id(), value })), } } diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 46ef9ba9b..5a6de236b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -29,9 +29,9 @@ pub fn extract_geo_points( let obkv = obkv::KvReader::new(value); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let primary_key = || -> Value { - let primary_key = obkv.get(primary_key_id).unwrap(); - serde_json::from_slice(primary_key).unwrap() + let document_id = || -> Value { + let document_id = 
obkv.get(primary_key_id).unwrap(); + serde_json::from_slice(document_id).unwrap() }; // first we get the two fields @@ -43,19 +43,19 @@ pub fn extract_geo_points( let lat = extract_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) - .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; let lng = extract_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) - .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: primary_key() })?; + return Err(GeoError::MissingLatitude { document_id: document_id() })?; } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: primary_key() })?; + return Err(GeoError::MissingLongitude { document_id: document_id() })?; } } From 0bbcc7b1808061355202efe040c38f64de64ba50 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 14:41:19 +0200 Subject: [PATCH 25/35] Expose the `DocumentId` struct to be sure to inject the generated ids --- milli/src/documents/enriched.rs | 13 ++++++----- milli/src/documents/reader.rs | 12 +++++----- milli/src/error.rs | 2 +- milli/src/update/index_documents/enrich.rs | 22 ++++++++++++------- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 13 +++++++++-- milli/src/update/mod.rs | 2 +- 7 files changed, 41 insertions(+), 25 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 918b47c95..4f45a891a 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -7,6 +7,7 @@ use super::{ DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, Error, }; +use crate::update::DocumentId; use crate::FieldId; /// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have @@ -66,10 +67,10 @@ impl EnrichedDocumentsBatchReader { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] pub struct EnrichedDocument<'a> { pub document: KvReader<'a, FieldId>, - pub external_id: &'a str, + pub document_id: DocumentId, } pub struct EnrichedDocumentsBatchCursor { @@ -110,13 +111,13 @@ impl EnrichedDocumentsBatchCursor { &mut self, ) -> Result, DocumentsBatchCursorError> { let document = self.documents.next_document()?; - let external_id = match self.external_ids.move_on_next()? { - Some((_, bytes)) => Some(str::from_utf8(bytes)?), + let document_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?, None => None, }; - match document.zip(external_id) { - Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + match document.zip(document_id) { + Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })), None => Ok(None), } } diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 7bd6dbd51..70b8b0131 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io, str}; +use std::{error, fmt, io}; use obkv::KvReader; @@ -95,7 +95,7 @@ impl DocumentsBatchCursor { #[derive(Debug)] pub enum DocumentsBatchCursorError { Grenad(grenad::Error), - Utf8(str::Utf8Error), + SerdeJson(serde_json::Error), } impl From for DocumentsBatchCursorError { @@ -104,9 +104,9 @@ impl From for DocumentsBatchCursorError { } } -impl From for DocumentsBatchCursorError { - fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError::Utf8(error) +impl From for DocumentsBatchCursorError { + fn from(error: serde_json::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::SerdeJson(error) } } @@ -116,7 +116,7 @@ impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DocumentsBatchCursorError::Grenad(e) => e.fmt(f), - DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 0419ceeda..0abb41eec 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -217,7 +217,7 @@ impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { match error { DocumentsBatchCursorError::Grenad(e) => Error::from(e), - DocumentsBatchCursorError::Utf8(e) => Error::from(e), + DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), } } } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 5d00565a8..1a0c31c24 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -2,6 +2,7 @@ use std::io::{Read, Seek}; use std::result::Result as StdResult; use std::{fmt, iter}; +use serde::{Deserialize, Serialize}; use serde_json::Value; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; @@ -89,14 +90,15 @@ pub fn enrich_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), document_id.value())?; - if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; + external_ids.insert(count.to_be_bytes(), document_id)?; + count += 1; } @@ -210,7 +212,7 @@ impl PrimaryKey<'_> { /// /// In case the document id has been auto-generated, the document nth is kept to help /// users debug if there is an issue with the document itself. 
-#[derive(Clone)] +#[derive(Serialize, Deserialize, Clone)] pub enum DocumentId { Retrieved { value: String }, Generated { value: String, document_nth: u32 }, @@ -225,16 +227,20 @@ impl DocumentId { DocumentId::Generated { value, document_nth } } - fn value(&self) -> &str { + fn debug(&self) -> String { + format!("{:?}", self) + } + + pub fn is_generated(&self) -> bool { + matches!(self, DocumentId::Generated { .. }) + } + + pub fn value(&self) -> &str { match self { DocumentId::Retrieved { value } => value, DocumentId::Generated { value, .. } => value, } } - - fn debug(&self) -> String { - format!("{:?}", self) - } } impl fmt::Debug for DocumentId { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index db1a768e6..615e1dfc7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -22,7 +22,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, + validate_geo_from_json, DocumentId, }; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e82556ec7..a34295a50 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,8 +153,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let primary_key_id_nested = primary_key.contains('.'); + let primary_key_id = + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut flattened_document = None; let mut obkv_buffer = Vec::new(); @@ -162,7 +163,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { - let EnrichedDocument { document, external_id } = enriched_document; + let EnrichedDocument { document, document_id } = enriched_document; let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { @@ -171,6 +172,14 @@ impl<'a, 'i> Transform<'a, 'i> { }); } + // When the document id has been auto-generated by the `enrich_documents_batch` + // we must insert this document id into the remaped document. 
+ let external_id = document_id.value(); + if document_id.is_generated() { + let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(docid))); + } + for (k, v) in document.iter() { let mapped_id = *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 965ed4fd2..1bf27a5f0 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; pub use self::index_documents::{ - DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{Setting, Settings}; From 5d149d631f7736443d5c871a3442bfd99dbaeb48 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 30 Jun 2022 15:13:50 +0200 Subject: [PATCH 26/35] Remove tests for a function that no more exists --- milli/src/update/index_documents/transform.rs | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index a34295a50..6bf3dde43 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -729,88 +729,3 @@ impl TransformOutput { .collect()) } } - -// #[cfg(test)] -// mod test { -// use super::*; - -// mod compute_primary_key { -// use big_s::S; - -// use super::{compute_primary_key_pair, FieldsIdsMap}; - -// #[test] -// fn should_return_primary_key_if_is_some() { -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto"), -// &mut fields_map, -// Some("tata".to_string()), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto".to_string())); -// assert_eq!(fields_map.len(), 1); - -// // and with nested fields -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto.tata").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto.tata"), -// &mut fields_map, -// Some(S("titi")), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_alternative_if_primary_is_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = -// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); -// assert_eq!(result.unwrap(), (0, S("tata"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_default_if_both_are_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, true); -// assert_eq!(result.unwrap(), (0, S("id"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_err_if_both_are_none_and_recompute_is_false() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, false); -// assert!(result.is_err()); -// assert_eq!(fields_map.len(), 0); -// } -// } - -// mod primary_key_inference { -// use big_s::S; -// use bimap::BiHashMap; - -// use crate::documents::DocumentsBatchIndex; -// use 
crate::update::index_documents::transform::find_primary_key; - -// #[test] -// fn primary_key_infered_on_first_field() { -// // We run the test multiple times to change the order in which the fields are iterated upon. -// for _ in 1..50 { -// let mut map = BiHashMap::new(); -// map.insert(1, S("fakeId")); -// map.insert(2, S("fakeId")); -// map.insert(3, S("fakeId")); -// map.insert(4, S("fakeId")); -// map.insert(0, S("realId")); - -// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); -// } -// } -// } -// } From 2eec290424b7508c3953be301e070028fdf900f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 16:36:23 +0200 Subject: [PATCH 27/35] Check the validity of the latitute and longitude numbers --- milli/src/error.rs | 6 ++--- milli/src/update/index_documents/enrich.rs | 24 ++++++++++++++----- .../extract/extract_geo_points.rs | 6 ++--- milli/src/update/index_documents/mod.rs | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 0abb41eec..80c923bd9 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -152,11 +152,11 @@ pub enum GeoError { MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, - #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, - #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, - #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a finite number but instead got `{value}`.")] BadLongitude { document_id: Value, value: Value }, } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 1a0c31c24..28318881b 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -326,11 +326,23 @@ pub fn validate_document_id_value(document_id: Value) -> Result StdResult { - match value { - Value::Number(ref n) => n.as_f64().ok_or(value), - Value::String(ref s) => s.parse::().map_err(|_| value), - value => Err(value), +pub fn extract_finite_float_from_value(value: Value) -> StdResult { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) } } @@ -340,7 +352,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { - match (extract_float_from_value(lat), extract_float_from_value(lng)) { + match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 5a6de236b..47085144a 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,7 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; -use crate::update::index_documents::extract_float_from_value; +use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
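For reference, the finiteness rule that `extract_finite_float_from_value` introduces above can be exercised on its own: JSON numbers and numeric strings are accepted, while everything else (including `NaN` and the infinities, which `str::parse::<f64>` happily produces) hands the original `Value` back to the caller for error reporting. A minimal self-contained sketch of the same logic, under an illustrative name rather than the milli export:

```rust
use serde_json::Value;

// Mirrors the rules of the function above: accept JSON numbers and numeric
// strings, reject any other shape as well as non-finite floats by returning
// the original `Value` so the caller can embed it in a `GeoError`.
fn finite_float(value: Value) -> Result<f64, Value> {
    let number = match value {
        Value::Number(ref n) => match n.as_f64() {
            Some(number) => number,
            None => return Err(value),
        },
        Value::String(ref s) => match s.parse::<f64>() {
            Ok(number) => number,
            Err(_) => return Err(value),
        },
        value => return Err(value),
    };

    if number.is_finite() {
        Ok(number)
    } else {
        Err(value)
    }
}

fn main() {
    assert_eq!(finite_float(Value::from(12.5)), Ok(12.5));
    assert_eq!(finite_float(Value::from("13")), Ok(13.0));
    // "NaN" parses as an `f64`, so only the `is_finite` check rejects it.
    assert!(finite_float(Value::from("NaN")).is_err());
    assert!(finite_float(Value::from(vec![12, 13])).is_err());
}
```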
@@ -40,12 +40,12 @@ pub fn extract_geo_points( if let Some((lat, lng)) = lat.zip(lng) { // then we extract the values - let lat = extract_float_from_value( + let lat = extract_finite_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; - let lng = extract_float_from_value( + let lng = extract_finite_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 615e1dfc7..652c1e72b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ - extract_float_from_value, validate_document_id, validate_document_id_value, + extract_finite_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, DocumentId, }; pub use self::helpers::{ From dc61105554954801cce37cc4cca32b1ef4814347 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 17:44:08 +0200 Subject: [PATCH 28/35] Fix the nested document id fetching function --- milli/src/update/index_documents/enrich.rs | 4 +++- milli/src/update/index_documents/mod.rs | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 28318881b..0298f0532 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -204,7 +204,9 @@ impl PrimaryKey<'_> { /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { let name = self.name(); - iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 652c1e72b..54599acce 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1198,7 +1198,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# ); let documents = documents!([ @@ -1212,7 +1212,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."# ); let documents = documents!([ @@ -1226,7 +1226,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."# + r#"Could not parse longitude in the document with the id: `0`. 
Was expecting a finite number but instead got `"hello"`."# ); } From a892a4a79c5933a2eedb04d6b46bd19d66229729 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 18:38:50 +0200 Subject: [PATCH 29/35] Introduce a function to extend from a JSON array of objects --- benchmarks/benches/utils.rs | 5 +- cli/src/main.rs | 5 +- http-ui/src/main.rs | 5 +- milli/src/documents/builder.rs | 9 ++++ milli/src/documents/mod.rs | 1 + milli/src/documents/serde_impl.rs | 76 +++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 milli/src/documents/serde_impl.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 630e17943..51178b43b 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -164,11 +164,8 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/cli/src/main.rs b/cli/src/main.rs index 0d197af17..35fef95c6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -337,11 +337,8 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 117aa31e8..83fce9a9c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1042,11 +1042,8 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 589e52269..bb9d6aa68 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,11 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; +use serde::de::Deserializer; use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::documents::serde_impl::DocumentVisitor; use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -78,6 +80,13 @@ impl DocumentsBatchBuilder { Ok(()) } + /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_json_array(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + let mut visitor = DocumentVisitor::new(self); + de.deserialize_any(&mut visitor)? + } + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
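All three call sites above collapse into the same pattern: instead of deserializing the whole payload into a `Vec<Object>` and appending object by object, the reader is handed to `append_json_array`, which streams each object into the batch as it is parsed. A sketch of the shared helper, matching the `anyhow`-flavoured signatures used by the callers in this patch:

```rust
use std::io::Read;

use milli::documents::DocumentsBatchBuilder;

fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
    let mut documents = DocumentsBatchBuilder::new(Vec::new());

    // The JSON array is visited object by object, so no intermediate
    // `Vec<Object>` is ever materialized in memory.
    documents.append_json_array(reader)?;

    documents.into_inner().map_err(Into::into)
}
```

The streaming itself is driven by the `DocumentVisitor` added just below, whose `visit_map` buffers a single object at a time before appending it to the builder.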
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 43bfc1c20..c5ff7a120 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,6 +1,7 @@ mod builder; mod enriched; mod reader; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs new file mode 100644 index 000000000..d4abdc844 --- /dev/null +++ b/milli/src/documents/serde_impl.rs @@ -0,0 +1,76 @@ +use std::fmt; +use std::io::Write; + +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; + +use super::Error; +use crate::documents::DocumentsBatchBuilder; +use crate::Object; + +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + +pub struct DocumentVisitor<'a, W> { + inner: &'a mut DocumentsBatchBuilder, + object: Object, +} + +impl<'a, W> DocumentVisitor<'a, W> { + pub fn new(inner: &'a mut DocumentsBatchBuilder) -> Self { + DocumentVisitor { inner, object: Object::new() } + } +} + +impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { + /// This Visitor value is nothing, since it write the value to a file. + type Value = Result<(), Error>; + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } + + Ok(Ok(())) + } + + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + self.object.clear(); + while let Some((key, value)) = map.next_entry()? { + self.object.insert(key, value); + } + + tri!(self.inner.append_json_object(&self.object)); + + Ok(Ok(())) + } + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") + } +} + +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where + W: Write, +{ + type Value = Result<(), Error>; + + fn deserialize(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) + } +} From 192793ee38c7d819b5b542c8c7d0f1ae473d9619 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 12:42:06 +0200 Subject: [PATCH 30/35] Add some tests to check for the nested documents ids --- milli/src/update/index_documents/enrich.rs | 9 ++-- milli/src/update/index_documents/mod.rs | 52 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 0298f0532..d7ab89faa 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -257,12 +257,9 @@ impl fmt::Debug for DocumentId { } fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..] 
- .chars() - .next() - .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL) - .unwrap_or(true) + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) } pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54599acce..c9890f93f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1627,6 +1627,58 @@ mod tests { assert_eq!(documents_ids, vec![3]); } + #[test] + fn retrieve_a_b_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + + // There must be an issue with the primary key no present in the given document + user_error.unwrap_err(); + } + + #[test] + fn retrieve_a_b_c_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b.c".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); + assert!(external_documents_ids.get("1").is_some()); + } + #[test] fn test_facets_generation() { let path = tempfile::tempdir().unwrap(); From 25e768f31c32d8718132be124ab2e32c94c15f8f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 14:41:33 +0200 Subject: [PATCH 31/35] Fix another issue with the nested primary key selector --- milli/src/update/index_documents/enrich.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index d7ab89faa..56f8fa4c0 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -256,7 +256,7 @@ impl fmt::Debug for DocumentId { } } -fn contained_in(selector: &str, key: &str) -> bool { +fn starts_with(selector: &str, key: &str) -> bool { selector.strip_prefix(key).map_or(false, |tail| { tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) }) @@ -282,12 +282,7 @@ pub fn fetch_matching_values_in_object( format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, 
key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side. - let should_continue = - contained_in(selector, &base_key) || contained_in(&base_key, selector); - - if should_continue { + if starts_with(selector, &base_key) { match value { Value::Object(object) => { fetch_matching_values_in_object(object, selector, &base_key, output) From 448114cc1c1edd1781a830590f2c888dcdab775d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 15:22:09 +0200 Subject: [PATCH 32/35] Fix the benchmarks with the new indexation API --- benchmarks/benches/indexing.rs | 40 +++++++++++++--------- benchmarks/benches/utils.rs | 4 +-- milli/src/update/index_documents/enrich.rs | 6 ++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 80c7ba0ed..81b21b5ea 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -170,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -185,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -460,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -476,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -680,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); 
let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -695,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1079,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1095,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 51178b43b..fba05edbe 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -138,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { } } -pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { +pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); let reader = BufReader::new(reader); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 56f8fa4c0..51495c598 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -30,7 +30,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; - let mut uuid_buffer = [0; 
uuid::adapter::Hyphenated::LENGTH]; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -119,7 +119,7 @@ fn fetch_or_generate_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], + uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { match primary_key { @@ -134,7 +134,7 @@ fn fetch_or_generate_document_id( } } None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { From ab1571cdec615d025eeab8c924df74a5c485dceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 12:41:58 +0200 Subject: [PATCH 33/35] Simplify Transform::read_documents, enabled by enriched documents reader --- milli/src/update/index_documents/transform.rs | 97 +++---------------- 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 6bf3dde43..d03a803fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,18 +153,18 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - let primary_key_id_nested = primary_key.contains('.'); let primary_key_id = self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; - let mut flattened_document = None; let mut obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; + let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, document_id } = enriched_document; + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer + // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -176,8 +176,9 @@ impl<'a, 'i> Transform<'a, 'i> { // we must insert this document id into the remaped document. let external_id = document_id.value(); if document_id.is_generated() { - let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, Cow::from(docid))); + serde_json::to_writer(&mut docid_buffer, external_id) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); } for (k, v) in document.iter() { @@ -186,22 +187,6 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.push((mapped_id, Cow::from(v))); } - // We need to make sure that every document has a primary key. After we have remapped - // all the fields in the document, we try to find the primary key value. 
If we can find - // it, transform it into a string and validate it, and then update it in the - // document. If none is found, and we were told to generate missing document ids, then - // we create the missing field, and update the new document. - if primary_key_id_nested { - let mut field_buffer_cache = field_buffer_cache.clone(); - self.flatten_from_field_mapping( - &mapping, - &document, - &mut flattened_obkv_buffer, - &mut field_buffer_cache, - )?; - flattened_document = Some(&flattened_obkv_buffer); - }; - // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. @@ -256,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> { } // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; + self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; documents_count += 1; - if let Some(flatten) = flattened_document { - self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; - } else { - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => { - self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? - } - } + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -275,6 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); + docid_buffer.clear(); obkv_buffer.clear(); } @@ -345,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(Some(buffer)) } - // Flatten a document from a field mapping generated by [create_fields_mapping] - fn flatten_from_field_mapping( - &mut self, - mapping: &HashMap, - obkv: &KvReader, - output_buffer: &mut Vec, - field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, - ) -> Result<()> { - // store the keys and values of the json + the original obkv - let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); - - // if the primary_key is nested we need to flatten the document before being able to do anything - let mut doc = serde_json::Map::new(); - - // we recreate a json containing only the fields that needs to be flattened. - // all the raw values get inserted directly in the `key_value` vec. - for (key, value) in obkv.iter() { - if json_depth_checker::should_flatten_from_unchecked_slice(value) { - let key = - mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?; - let key = - self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: *key, - process: "Flatten from field mapping.", - })?; - let value = serde_json::from_slice::(value) - .map_err(InternalError::SerdeJson)?; - doc.insert(key.to_string(), value); - } else { - key_value.push((key, value.into())); - } - } - - let flattened = flatten_serde_json::flatten(&doc); - - // Once we have the flattened version we insert all the new generated fields_ids - // (if any) in the fields ids map and serialize the value. 
- for (key, value) in flattened.into_iter() { - let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; - let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; - key_value.push((fid, value.clone().into())); - - if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { - field_buffer_cache.push((fid, value.into())); - } - } - - // we sort the key. If there was a conflict between the obkv and the new generated value the - // keys will be consecutive. - key_value.sort_unstable_by_key(|(key, _)| *key); - - Self::create_obkv_from_key_value(&mut key_value, output_buffer)?; - Ok(()) - } - /// Generate an obkv from a slice of key / value sorted by key. fn create_obkv_from_key_value( key_value: &mut [(FieldId, Cow<[u8]>)], From fc9f3f31e74edda92ad1beb31226e9cb100781e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 16:08:01 +0200 Subject: [PATCH 34/35] Change DocumentsBatchReader to access cursor and index at same time Otherwise it is not possible to iterate over all documents while using the fields index at the same time. --- milli/src/documents/builder.rs | 71 ++++++++++--------- milli/src/documents/enriched.rs | 25 ++----- milli/src/documents/mod.rs | 16 +++-- milli/src/documents/reader.rs | 20 ++---- milli/src/update/index_documents/enrich.rs | 9 +-- milli/src/update/index_documents/transform.rs | 6 +- 6 files changed, 65 insertions(+), 82 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index bb9d6aa68..dc027e1b7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -216,9 +216,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 3); let document = cursor.next_document().unwrap().unwrap(); @@ -240,9 +240,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 2); let document = cursor.next_document().unwrap().unwrap(); @@ -264,9 +264,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -292,9 +292,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = 
cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -319,9 +319,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -346,9 +346,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -373,9 +373,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -400,9 +400,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -427,9 +427,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -454,9 +454,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -482,8 +482,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert!(cursor.next_document().is_err()); } 
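The updated tests all follow the access pattern this patch is about: `into_cursor_and_fields_index` hands back the cursor and the `DocumentsBatchIndex` as a pair, so the documents can be iterated while the index is consulted, without cloning the index out of the cursor first. A condensed sketch of that pattern, assuming the `obkv_to_object` helper used by these tests is importable from `milli::documents`:

```rust
use std::io::Cursor;

use milli::documents::{obkv_to_object, DocumentsBatchReader};
use serde_json::Value;

// Hypothetical driver: decode every document of a batch back to JSON.
fn print_documents(vector: Vec<u8>) {
    let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
        .unwrap()
        .into_cursor_and_fields_index();

    while let Some(document) = cursor.next_document().unwrap() {
        // The fields index lives outside the cursor now, so it can be used
        // freely while the cursor is mutably borrowed by the iteration.
        let object = obkv_to_object(&document, &index).unwrap();
        println!("{}", Value::from(object));
    }
}
```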
@@ -498,9 +499,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 4f45a891a..fa21c0f87 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -56,14 +56,13 @@ impl EnrichedDocumentsBatchReader { } /// This method returns a forward cursor over the enriched documents. - pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + pub fn into_cursor_and_fields_index( + self, + ) -> (EnrichedDocumentsBatchCursor, DocumentsBatchIndex) { let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; + let (documents, fields_index) = documents.into_cursor_and_fields_index(); external_ids.reset(); - EnrichedDocumentsBatchCursor { - documents: documents.into_cursor(), - primary_key, - external_ids, - } + (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index) } } @@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor { } impl EnrichedDocumentsBatchCursor { - pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; - EnrichedDocumentsBatchReader { - documents: documents.into_reader(), - primary_key, - external_ids, - } - } - pub fn primary_key(&self) -> &str { &self.primary_key } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - self.documents.documents_batch_index() - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.documents.reset(); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index c5ff7a120..e766e29cf 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -203,10 +203,11 @@ mod test { builder.append_json_object(value.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); - assert_eq!(documents.documents_batch_index().iter().count(), 5); + assert_eq!(index.iter().count(), 5); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 5); assert!(documents.next_document().unwrap().is_none()); @@ -226,9 +227,10 @@ mod test { builder.append_json_object(doc2.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.documents_batch_index().iter().count(), 2); + let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + assert_eq!(index.iter().count(), 2); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 1); assert!(documents.next_document().unwrap().is_some()); @@ -243,7 +245,7 @@ mod test { } }]); - let mut cursor = docs_reader.into_cursor(); + let (mut cursor, _) = docs_reader.into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 70b8b0131..a8a4c662d 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -17,6 +17,10 @@ pub struct DocumentsBatchReader { } impl DocumentsBatchReader { + pub fn new(cursor: DocumentsBatchCursor, fields_index: DocumentsBatchIndex) -> Self { + Self { cursor: cursor.cursor, fields_index } + } + /// Construct a `DocumentsReader` from a reader. /// /// It first retrieves the index, then moves to the first document. Use the `into_cursor` @@ -46,30 +50,20 @@ impl DocumentsBatchReader { } /// This method returns a forward cursor over the documents. - pub fn into_cursor(self) -> DocumentsBatchCursor { + pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor, DocumentsBatchIndex) { let DocumentsBatchReader { cursor, fields_index } = self; - let mut cursor = DocumentsBatchCursor { cursor, fields_index }; + let mut cursor = DocumentsBatchCursor { cursor }; cursor.reset(); - cursor + (cursor, fields_index) } } /// A forward cursor over the documents in a `DocumentsBatchReader`. pub struct DocumentsBatchCursor { cursor: grenad::ReaderCursor, - fields_index: DocumentsBatchIndex, } impl DocumentsBatchCursor { - pub fn into_reader(self) -> DocumentsBatchReader { - let DocumentsBatchCursor { cursor, fields_index, .. } = self; - DocumentsBatchReader { cursor, fields_index } - } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - &self.fields_index - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.cursor.reset(); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 51495c598..7c9a016d8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -27,8 +27,8 @@ pub fn enrich_documents_batch( autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { - let mut cursor = reader.into_cursor(); - let mut documents_batch_index = cursor.documents_batch_index().clone(); + let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; @@ -103,9 +103,10 @@ pub fn enrich_documents_batch( } let external_ids = writer_into_reader(external_ids)?; + let primary_key_name = primary_key.name().to_string(); let reader = EnrichedDocumentsBatchReader::new( - cursor.into_reader(), - primary_key.name().to_string(), + DocumentsBatchReader::new(cursor, documents_batch_index), + primary_key_name, external_ids, )?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index d03a803fd..0de90924a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let mut cursor = reader.into_cursor(); - let fields_index = cursor.documents_batch_index(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); + let external_documents_ids = self.index.external_documents_ids(wtxn)?; - let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; + let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); let primary_key_id = From 41a0ce07cb5f07b886df99ddbf1889164486b0a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 20 Jul 2022 16:20:35 +0200 Subject: [PATCH 35/35] Add a code comment, as suggested in PR review Co-authored-by: Many the fish --- milli/src/documents/builder.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index dc027e1b7..1a57db34b 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -97,6 +97,7 @@ impl DocumentsBatchBuilder { .map(|(k, t)| (self.fields_index.insert(k), t)) .enumerate() .collect(); + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); let mut record = csv::StringRecord::new();
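Taken together, the last patches leave the documents module with a streaming write path and a paired read path, and `append_csv` now documents its invariant that field ids must reach the obkv writer in ascending order. A small end-to-end sketch under those APIs; the typed `name:type` CSV headers and the sample rows are illustrative, not taken from the test suite:

```rust
use std::io::Cursor;

use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};

fn main() {
    // Typed headers: `city` is kept as a string, `pop` is parsed as a number.
    let csv_content = "city:string,pop:number\nBoston,667137\nLyon,522250\n";
    let csv = csv::Reader::from_reader(csv_content.as_bytes());

    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_csv(csv).unwrap();
    assert_eq!(builder.documents_count(), 2);
    let vector = builder.into_inner().unwrap();

    let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
        .unwrap()
        .into_cursor_and_fields_index();

    // Both typed headers were registered in the fields index.
    assert_eq!(index.len(), 2);
    while let Some(document) = cursor.next_document().unwrap() {
        assert_eq!(document.iter().count(), 2);
    }
}
```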