Rework the DocumentsBatchBuilder/Reader to use grenad

2025-06-14 03:51:38 +02:00 · 2022-06-14 16:03:48 +02:00 · 2022-06-14 16:03:48 +02:00 · 419ce3966c
commit 419ce3966c
parent eb63af1f10
4 changed files with 218 additions and 356 deletions
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@ -1,157 +1,159 @@
-use std::collections::BTreeMap;
+use std::io::{self, Write};
 use std::io;
 use std::io::{Cursor, Write};
-use byteorder::{BigEndian, WriteBytesExt};
+use grenad::{CompressionType, WriterBuilder};
-use serde::Deserializer;
+use serde_json::{to_writer, Map, Value};
 use serde_json::Value;
-use super::serde_impl::DocumentVisitor;
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 use crate::FieldId;
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
-/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
+/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
-/// iterate over the documents.
+/// to iterate over the documents.
 ///
 /// ## example:
 /// ```
 /// use milli::documents::DocumentBatchBuilder;
 /// use serde_json::json;
-/// use std::io::Cursor;
+/// use milli::documents::DocumentsBatchBuilder;
 ///
-/// let json = r##"{"id": 1, "name": "foo"}"##;
+/// let json = json!({ "id": 1, "name": "foo" });
-/// let mut writer = Cursor::new(Vec::new());
+///
-/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
+/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
-/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
+/// builder.append_json_object(json.as_object().unwrap()).unwrap();
-/// builder.finish().unwrap();
+/// let _vector = builder.into_inner().unwrap();
 /// ```
-pub struct DocumentBatchBuilder<W> {
+pub struct DocumentsBatchBuilder<W> {
-    inner: ByteCounter<W>,
+    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
-    index: DocumentsBatchIndex,
+    writer: grenad::Writer<W>,
    /// A map that creates the relation between field ids and field names.
    fields_index: DocumentsBatchIndex,
    /// The number of documents that were added to this builder,
    /// it doesn't take the primary key of the documents into account at this point.
    documents_count: u32,
    /// A buffer to store a temporary obkv buffer and avoid reallocating.
    obkv_buffer: Vec<u8>,
    /// A buffer to serialize the values and avoid reallocating,
    /// serialized values are stored in an obkv.
    value_buffer: Vec<u8>,
    values: BTreeMap<FieldId, Value>,
    count: usize,
 }
-impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
+impl<W: Write> DocumentsBatchBuilder<W> {
-    pub fn new(writer: W) -> Result<Self, Error> {
+    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
-        let index = DocumentsBatchIndex::default();
+        DocumentsBatchBuilder {
-        let mut writer = ByteCounter::new(writer);
+            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
-        // add space to write the offset of the metadata at the end of the writer
+            fields_index: DocumentsBatchIndex::default(),
-        writer.write_u64::<BigEndian>(0)?;
+            documents_count: 0,
        Ok(Self {
            inner: writer,
            index,
            obkv_buffer: Vec::new(),
            value_buffer: Vec::new(),
-            values: BTreeMap::new(),
+        }
            count: 0,
        })
    }
-    /// Returns the number of documents that have been written to the builder.
+    /// Returns the number of documents inserted into this builder.
-    pub fn len(&self) -> usize {
+    pub fn documents_count(&self) -> u32 {
-        self.count
+        self.documents_count
    }
-    /// This method must be called after the document addition is terminated. It will put the
+    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
-    /// metadata at the end of the file, and write the metadata offset at the beginning on the
+    pub fn append_json_object(&mut self, object: &Map<String, Value>) -> io::Result<()> {
-    /// file.
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
-    pub fn finish(self) -> Result<usize, Error> {
+        let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect();
-        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
+        fields_ids.sort_unstable();
-        let meta = DocumentsMetadata { count, index };
+        self.obkv_buffer.clear();
        let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
        for field_id in fields_ids {
            let key = self.fields_index.name(field_id).unwrap();
            self.value_buffer.clear();
            to_writer(&mut self.value_buffer, &object[key])?;
            writer.insert(field_id, &self.value_buffer)?;
        }
-        bincode::serialize_into(&mut writer, &meta)?;
+        let internal_id = self.documents_count.to_be_bytes();
        let document_bytes = writer.into_inner()?;
        self.writer.insert(internal_id, &document_bytes)?;
        self.documents_count += 1;
-        writer.seek(io::SeekFrom::Start(0))?;
+        Ok(())
        writer.write_u64::<BigEndian>(offset as u64)?;
        writer.flush()?;
        Ok(count)
    }
-    /// Extends the builder with json documents from a reader.
+    /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly.
-    pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+    pub fn append_csv<R: io::Read>(&mut self, mut reader: csv::Reader<R>) -> Result<(), Error> {
-        let mut de = serde_json::Deserializer::from_reader(reader);
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
-
+        let mut typed_fields_ids: Vec<_> = reader
        let mut visitor = DocumentVisitor {
            inner: &mut self.inner,
            index: &mut self.index,
            obkv_buffer: &mut self.obkv_buffer,
            value_buffer: &mut self.value_buffer,
            values: &mut self.values,
            count: &mut self.count,
        };
        de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
    }
    /// Creates a builder from a reader of CSV documents.
    ///
    /// Since all fields in a CSV document are guaranteed to be ordered, we are able to perform
    /// optimisations, and extending from another CSV is not allowed.
    pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
        let mut this = Self::new(writer)?;
        // Ensure that this is the first and only addition made with this builder
        debug_assert!(this.index.is_empty());
        let mut records = csv::Reader::from_reader(reader);
        let headers = records
            .headers()?
            .into_iter()
            .map(parse_csv_header)
-            .map(|(k, t)| (this.index.insert(k), t))
+            .map(|(k, t)| (self.fields_index.insert(k), t))
-            .collect::<BTreeMap<_, _>>();
+            .enumerate()
            .collect();
        typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid);
-        for (i, record) in records.into_records().enumerate() {
+        let mut record = csv::StringRecord::new();
-            let record = record?;
+        let mut line = 0;
-            this.obkv_buffer.clear();
+        while reader.read_record(&mut record)? {
-            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
+            // We increment here and not at the end of the while loop to take
-            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
+            // the header offset into account.
-                let value = match ty {
+            line += 1;
            self.obkv_buffer.clear();
            let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer);
            for (i, (field_id, type_)) in typed_fields_ids.iter() {
                self.value_buffer.clear();
                let value = &record[*i];
                match type_ {
                    AllowedType::Number => {
                        if value.trim().is_empty() {
-                            Value::Null
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
                        } else {
-                            value.trim().parse::<f64>().map(Value::from).map_err(|error| {
+                            match value.trim().parse::<f64>() {
-                                Error::ParseFloat {
+                                Ok(float) => {
-                                    error,
+                                    to_writer(&mut self.value_buffer, &float)?;
                                    // +1 for the header offset.
                                    line: i + 1,
                                    value: value.to_string(),
                                }
-                            })?
+                                Err(error) => {
                                    return Err(Error::ParseFloat {
                                        error,
                                        line,
                                        value: value.to_string(),
                                    });
                                }
                            }
                        }
                    }
                    AllowedType::String => {
                        if value.is_empty() {
-                            Value::Null
+                            to_writer(&mut self.value_buffer, &Value::Null)?;
                        } else {
-                            Value::String(value.to_string())
+                            to_writer(&mut self.value_buffer, value)?;
                        }
                    }
-                };
+                }
-                this.value_buffer.clear();
+                // We insert into the obkv writer the value buffer that has been filled just above.
-                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
+                writer.insert(*field_id, &self.value_buffer)?;
                writer.insert(*fid, &this.value_buffer)?;
            }
-            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
+            let internal_id = self.documents_count.to_be_bytes();
-            this.inner.write_all(&this.obkv_buffer)?;
+            let document_bytes = writer.into_inner()?;
-
+            self.writer.insert(internal_id, &document_bytes)?;
-            this.count += 1;
+            self.documents_count += 1;
        }
-        Ok(this)
+        Ok(())
    }
    /// Serializes the final version of the `DocumentsBatchIndex`, stores it in the grenad
    /// writer and returns the underlying writer.
    pub fn into_inner(mut self) -> io::Result<W> {
        // The `DocumentsBatchIndex` is inserted as the last key of the grenad writer, the
        // value buffer is reused to hold its serialized form.
        self.value_buffer.clear();
        to_writer(&mut self.value_buffer, &self.fields_index)?;
        self.writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?;
        self.writer.into_inner()
    }
 }
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@ -1,24 +1,22 @@
 mod builder;
 /// The documents module defines an intermediary document format that milli uses for indexation, and
 /// provides an API to easily build and read such documents.
 ///
 /// The `DocumentsBatchBuilder` interface allows writing batches of documents to a writer, which
 /// can later be read by milli using the `DocumentsBatchReader` interface.
 mod reader;
 mod serde_impl;
 use std::fmt::{self, Debug};
 use std::io;
 use bimap::BiHashMap;
-pub use builder::DocumentBatchBuilder;
+pub use builder::DocumentsBatchBuilder;
-pub use reader::DocumentBatchReader;
+pub use reader::{DocumentsBatchCursor, DocumentsBatchReader};
 use serde::{Deserialize, Serialize};
 use crate::FieldId;
 /// The key that is used to store the `DocumentsBatchIndex` datastructure,
 /// it is the absolute last key of the list.
 const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes();
 /// A bidirectional map that links field ids to their name in a document batch.
-#[derive(Default, Debug, Serialize, Deserialize)]
+#[derive(Default, Clone, Debug, Serialize, Deserialize)]
 pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
 impl DocumentsBatchIndex {
@ -46,8 +44,8 @@ impl DocumentsBatchIndex {
        self.0.iter()
    }
-    pub fn name(&self, id: FieldId) -> Option<&String> {
+    pub fn name(&self, id: FieldId) -> Option<&str> {
-        self.0.get_by_left(&id)
+        self.0.get_by_left(&id).map(AsRef::as_ref)
    }
    pub fn recreate_json(
@ -69,50 +67,20 @@ impl DocumentsBatchIndex {
    }
 }
 /// The metadata of a documents batch: the number of documents and the fields index.
 #[derive(Debug, Serialize, Deserialize)]
 struct DocumentsMetadata {
    /// The number of documents that the batch contains.
    count: usize,
    /// The map that links the field ids to their names.
    index: DocumentsBatchIndex,
 }
 /// A writer wrapper that keeps track of the number of bytes written to the inner writer.
 pub struct ByteCounter<W> {
    count: usize,
    writer: W,
 }
 impl<W> ByteCounter<W> {
    /// Wraps the given writer, the counter starts at zero.
    fn new(writer: W) -> Self {
        ByteCounter { writer, count: 0 }
    }
 }
 impl<W: io::Write> io::Write for ByteCounter<W> {
    /// Forwards the bytes to the inner writer and adds the number of bytes
    /// that it accepted to the counter.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        self.writer.write(buf).map(|written| {
            self.count += written;
            written
        })
    }
    /// Flushes the inner writer, the counter is left untouched.
    fn flush(&mut self) -> io::Result<()> {
        self.writer.flush()
    }
 }
 #[derive(Debug)]
 pub enum Error {
    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
    InvalidDocumentFormat,
-    Custom(String),
+    Csv(csv::Error),
-    JsonError(serde_json::Error),
+    Json(serde_json::Error),
-    CsvError(csv::Error),
+    Serialize(serde_json::Error),
-    Serialize(bincode::Error),
+    Grenad(grenad::Error),
    Io(io::Error),
    DocumentTooLarge,
 }
 impl From<csv::Error> for Error {
    fn from(e: csv::Error) -> Self {
-        Self::CsvError(e)
+        Self::Csv(e)
    }
 }
@ -122,15 +90,15 @@ impl From<io::Error> for Error {
    }
 }
-impl From<bincode::Error> for Error {
+impl From<serde_json::Error> for Error {
-    fn from(other: bincode::Error) -> Self {
+    fn from(other: serde_json::Error) -> Self {
-        Self::Serialize(other)
+        Self::Json(other)
    }
 }
-impl From<serde_json::Error> for Error {
+impl From<grenad::Error> for Error {
-    fn from(other: serde_json::Error) -> Self {
+    fn from(other: grenad::Error) -> Self {
-        Self::JsonError(other)
+        Self::Grenad(other)
    }
 }
@ -140,13 +108,14 @@ impl fmt::Display for Error {
            Error::ParseFloat { error, line, value } => {
                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
            }
-            Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
+            Error::InvalidDocumentFormat => {
-            Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
+                f.write_str("Invalid document addition format, missing the documents batch index.")
-            Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
+            }
            Error::Io(e) => write!(f, "{}", e),
            Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
            Error::Serialize(e) => write!(f, "{}", e),
-            Error::CsvError(e) => write!(f, "{}", e),
+            Error::Grenad(e) => write!(f, "{}", e),
            Error::Csv(e) => write!(f, "{}", e),
            Error::Json(e) => write!(f, "{}", e),
        }
    }
 }
@ -158,15 +127,25 @@ impl std::error::Error for Error {}
 macro_rules! documents {
    ($data:tt) => {{
        let documents = serde_json::json!($data);
-        let mut writer = std::io::Cursor::new(Vec::new());
+        let documents = match documents {
-        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
+            object @ serde_json::Value::Object(_) => vec![object],
-        let documents = serde_json::to_vec(&documents).unwrap();
+            serde_json::Value::Array(objects) => objects,
-        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
+            invalid => {
-        builder.finish().unwrap();
+                panic!("an array of objects must be specified, {:#?} is not an array", invalid)
            }
        };
-        writer.set_position(0);
+        let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
        for document in documents {
            let object = match document {
                serde_json::Value::Object(object) => object,
                invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
            };
            builder.append_json_object(&object).unwrap();
        }
-        crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+        let vector = builder.into_inner().unwrap();
        crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
    }};
 }
--- a/milli/src/documents/reader.rs
+++ b/milli/src/documents/reader.rs
@ -1,11 +1,9 @@
 use std::convert::TryInto;
 use std::io;
 use std::io::{BufReader, Read};
 use std::mem::size_of;
 use byteorder::{BigEndian, ReadBytesExt};
 use obkv::KvReader;
-use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::FieldId;
 /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
@ -13,63 +11,80 @@ use crate::FieldId;
 ///
 /// The documents are returned in the form of `obkv::Reader` where each field is identified with a
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
-pub struct DocumentBatchReader<R> {
+pub struct DocumentsBatchReader<R> {
-    reader: BufReader<R>,
+    cursor: grenad::ReaderCursor<R>,
-    metadata: DocumentsMetadata,
+    fields_index: DocumentsBatchIndex,
    buffer: Vec<u8>,
    seen_documents: usize,
 }
-impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
+impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
    /// Constructs a `DocumentsBatchReader` from a reader.
    ///
-    /// It first retrieves the index, then moves to the first document. Subsequent calls to
+    /// It first retrieves the index, then moves to the first document. Use the `into_cursor`
-    /// `next_document` advance the document reader until all the documents have been read.
+    /// method to iterate over the documents, from the first to the last.
-    pub fn from_reader(mut reader: R) -> Result<Self, Error> {
+    pub fn from_reader(reader: R) -> Result<Self, Error> {
-        let mut buffer = Vec::new();
+        let reader = grenad::Reader::new(reader)?;
        let mut cursor = reader.into_cursor()?;
-        let meta_offset = reader.read_u64::<BigEndian>()?;
+        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
-        reader.seek(io::SeekFrom::Start(meta_offset))?;
+            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
-        reader.read_to_end(&mut buffer)?;
+            None => return Err(Error::InvalidDocumentFormat),
-        let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
+        };
-        reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
+        Ok(DocumentsBatchReader { cursor, fields_index })
        buffer.clear();
        let reader = BufReader::new(reader);
        Ok(Self { reader, metadata, buffer, seen_documents: 0 })
    }
-    /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
+    pub fn documents_count(&self) -> u32 {
-    /// reference to the addition index.
+        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
    pub fn next_document_with_index<'a>(
        &'a mut self,
    ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
        if self.seen_documents < self.metadata.count {
            let doc_len = self.reader.read_u32::<BigEndian>()?;
            self.buffer.resize(doc_len as usize, 0);
            self.reader.read_exact(&mut self.buffer)?;
            self.seen_documents += 1;
            let reader = KvReader::new(&self.buffer);
            Ok(Some((&self.metadata.index, reader)))
        } else {
            Ok(None)
        }
    }
    /// Return the fields index for the documents batch.
    pub fn index(&self) -> &DocumentsBatchIndex {
        &self.metadata.index
    }
    /// Returns the number of documents in the reader.
    pub fn len(&self) -> usize {
        self.metadata.count
    }
    pub fn is_empty(&self) -> bool {
-        self.len() == 0
+        self.cursor.len().saturating_sub(1) == 0
    }
    /// Returns the fields index of this documents batch.
    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
        &self.fields_index
    }
    /// Consumes the reader and turns it into a forward cursor over the documents,
    /// the cursor is reset before being returned.
    pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
        let mut documents =
            DocumentsBatchCursor { cursor: self.cursor, fields_index: self.fields_index };
        documents.reset();
        documents
    }
 }
 /// A forward cursor over the documents in a `DocumentsBatchReader`.
 pub struct DocumentsBatchCursor<R> {
    cursor: grenad::ReaderCursor<R>,
    fields_index: DocumentsBatchIndex,
 }
 impl<R> DocumentsBatchCursor<R> {
    /// Converts this cursor back into a `DocumentsBatchReader`.
    pub fn into_reader(self) -> DocumentsBatchReader<R> {
        DocumentsBatchReader { cursor: self.cursor, fields_index: self.fields_index }
    }
    /// Returns the fields index of the documents batch.
    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
        &self.fields_index
    }
    /// Resets the cursor to be able to read from the start again.
    pub fn reset(&mut self) {
        self.cursor.reset();
    }
 }
 impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
    /// Advances the cursor and returns the document it points to. `None` is returned once
    /// the entry of the documents batch index, or the end of the batch, is reached.
    pub fn next_document(&mut self) -> Result<Option<KvReader<FieldId>>, grenad::Error> {
        let document = match self.cursor.move_on_next()? {
            // The entry stored under this key is the fields index, not a document.
            Some((key, _)) if key == DOCUMENTS_BATCH_INDEX_KEY => None,
            Some((_, value)) => Some(KvReader::new(value)),
            None => None,
        };
        Ok(document)
    }
 }
--- a/milli/src/documents/serde_impl.rs
+++ b/milli/src/documents/serde_impl.rs
@ -1,134 +0,0 @@
 use std::collections::BTreeMap;
 use std::fmt;
 use std::io::{Cursor, Write};
 use byteorder::WriteBytesExt;
 use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
 use serde::Deserialize;
 use serde_json::Value;
 use super::{ByteCounter, DocumentsBatchIndex, Error};
 use crate::FieldId;
 /// Evaluates to the `Ok` value of the expression, otherwise makes the enclosing
 /// function return `Ok(Err(..))` with the error converted through `Into`.
 macro_rules! tri {
    ($result:expr) => {
        match $result {
            Ok(value) => value,
            Err(error) => return Ok(Err(error.into())),
        }
    };
 }
 struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);
 impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
    type Value = FieldId;
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_str(self)
    }
 }
 impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
    type Value = FieldId;
    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
    where
        E: serde::de::Error,
    {
        Ok(self.0.insert(v))
    }
    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "a string")
    }
 }
 struct ValueDeserializer;
 impl<'de> DeserializeSeed<'de> for ValueDeserializer {
    type Value = serde_json::Value;
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        serde_json::Value::deserialize(deserializer)
    }
 }
 /// A serde visitor that writes every map it visits as a length-prefixed obkv document
 /// into `inner`. All the fields are borrowed so that the buffers can be reused by the owner.
 pub struct DocumentVisitor<'a, W> {
    /// The writer that receives the length-prefixed documents.
    pub inner: &'a mut ByteCounter<W>,
    /// The index used to turn the field names into field ids.
    pub index: &'a mut DocumentsBatchIndex,
    /// A buffer that stores the obkv of the document being written.
    pub obkv_buffer: &'a mut Vec<u8>,
    /// A buffer that stores the JSON serialization of one value at a time.
    pub value_buffer: &'a mut Vec<u8>,
    /// The values of the document being visited, iterated in field id order
    /// when the obkv is written.
    pub values: &'a mut BTreeMap<FieldId, Value>,
    /// The number of documents written, incremented once per visited map.
    pub count: &'a mut usize,
 }
 impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
    /// This Visitor value is nothing, since it writes the value to a file.
    type Value = Result<(), Error>;
    /// Visits a sequence of documents, every element is deserialized by using this
    /// visitor as the seed. The first element that yields an `Error` stops the visit.
    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
    where
        A: SeqAccess<'de>,
    {
        while let Some(v) = seq.next_element_seed(&mut *self)? {
            tri!(v)
        }
        Ok(Ok(()))
    }
    /// Visits a single document: collects its entries into `values`, serializes them
    /// into an obkv and writes the obkv, prefixed by its length, into `inner`.
    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
    where
        A: MapAccess<'de>,
    {
        // The keys are resolved into field ids while the entries are read.
        while let Some((key, value)) =
            map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
        {
            self.values.insert(key, value);
        }
        self.obkv_buffer.clear();
        let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
        // The `BTreeMap` yields the entries ordered by field id.
        for (key, value) in self.values.iter() {
            self.value_buffer.clear();
            // This is guaranteed to work
            tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
            tri!(obkv.insert(*key, &self.value_buffer));
        }
        let reader = tri!(obkv.into_inner()).into_inner();
        // NOTE(review): the `as u32` cast silently truncates the length of a document
        // bigger than `u32::MAX` bytes, confirm that this cannot happen.
        tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
        tri!(self.inner.write_all(reader));
        *self.count += 1;
        // NOTE(review): `values` is only cleared here, it keeps its content when one of
        // the `tri!` above returns early.
        self.values.clear();
        Ok(Ok(()))
    }
    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "a documents, or a sequence of documents.")
    }
 }
 /// Allows the visitor to be used as the seed of every element of a sequence,
 /// each element must be a map.
 impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
 where
    W: Write,
 {
    type Value = Result<(), Error>;
    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        deserializer.deserialize_map(self)
    }
 }