From f8fe9316c08b924086a700a83616b8fa619707f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 21 Oct 2021 11:56:14 +0200 Subject: [PATCH 01/22] Update version for the next release (v0.18.1) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 2f3c26a7b..b3a15f100 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 83d5f67c4..a7af6fb9b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b0220993f..2701c36d7 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..01309797d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.0" +version = "0.18.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index b55c4b8d6..dd74ad7b6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" From 8d70b01714acdca6e27dccbf08e6c42829973ccd Mon Sep 17 00:00:00 2001 From: marin postma Date: Wed, 20 Oct 2021 21:26:52 +0200 Subject: [PATCH 02/22] optimize document deserialization --- milli/src/documents/builder.rs | 142 ++++++- milli/src/documents/mod.rs | 20 +- milli/src/documents/serde.rs | 532 +++++------------------- milli/src/search/distinct/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 3 +- milli/tests/search/mod.rs | 7 +- milli/tests/search/query_criteria.rs | 3 +- 7 files changed, 241 insertions(+), 469 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index ba1319eff..98213edd7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,13 @@ +use std::collections::BTreeMap; use std::io; use byteorder::{BigEndian, WriteBytesExt}; -use serde::ser::Serialize; +use serde::Deserializer; +use serde_json::Value; -use super::serde::DocumentSerializer; +use crate::FieldId; + +use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -24,7 +28,12 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { - serializer: DocumentSerializer, + inner: ByteCounter, + index: DocumentsBatchIndex, + obkv_buffer: Vec, + value_buffer: Vec, + values: BTreeMap, + count: usize, } impl DocumentBatchBuilder { @@ -34,27 +43,33 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let serializer = - DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, 
allow_seq: true }; + let this = Self { + inner: writer, + index, + obkv_buffer: Vec::new(), + value_buffer: Vec::new(), + values: BTreeMap::new(), + count: 0, + }; - Ok(Self { serializer }) + Ok(this) } /// Returns the number of documents that have been written to the builder. pub fn len(&self) -> usize { - self.serializer.count + self.count } /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let DocumentSerializer { - writer: ByteCounter { mut writer, count: offset }, + let Self { + inner: ByteCounter { mut writer, count: offset }, index, count, .. - } = self.serializer; + } = self; let meta = DocumentsMetadata { count, index }; @@ -68,13 +83,106 @@ impl DocumentBatchBuilder { Ok(()) } - /// Adds documents to the builder. - /// - /// The internal index is updated with the fields found - /// in the documents. Document must either be a map or a sequences of map, anything else will - /// fail. - pub fn add_documents(&mut self, document: T) -> Result<(), Error> { - document.serialize(&mut self.serializer)?; + + /// Extends the builder with json documents from a reader. + pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + + let mut visitor = DocumentVisitor { + inner: &mut self.inner, + index: &mut self.index, + obkv_buffer: &mut self.obkv_buffer, + value_buffer: &mut self.value_buffer, + values: &mut self.values, + count: &mut self.count, + }; + + de.deserialize_any(&mut visitor).unwrap(); + Ok(()) } } + +#[cfg(test)] +mod test { + use std::io::Cursor; + + use crate::documents::DocumentBatchReader; + + use super::*; + + #[test] + fn add_single_documents_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!({ + "id": 1, + "field": "hello!", + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + let json = serde_json::json!({ + "blabla": false, + "field": "hello!", + "id": 1, + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn add_documents_seq_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!([{ + "id": 1, + "field": "hello!", + },{ + "blabla": false, + "field": "hello!", + "id": 1, + } + ]); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + 
let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index f79c210fe..ce0539c24 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -92,7 +92,8 @@ macro_rules! documents { let documents = serde_json::json!($data); let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - builder.add_documents(documents).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); builder.finish().unwrap(); writer.set_position(0); @@ -124,7 +125,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(json).unwrap(); + todo!(); + //builder.add_documents(json).unwrap(); builder.finish().unwrap(); @@ -153,8 +155,9 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + todo!(); + //builder.add_documents(doc1).unwrap(); + //builder.add_documents(doc2).unwrap(); builder.finish().unwrap(); @@ -182,7 +185,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(docs).unwrap(); + todo!(); + //builder.add_documents(docs).unwrap(); builder.finish().unwrap(); @@ -210,11 +214,13 @@ mod test { { "tata": "hello" }, ]]); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); let docs = json!("hello"); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 036ec246a..0d02fff6c 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,474 +1,128 @@ use std::collections::BTreeMap; -use std::convert::TryInto; use std::io::Cursor; -use std::{fmt, io}; +use std::io::Write; +use std::fmt; -use byteorder::{BigEndian, WriteBytesExt}; -use obkv::KvWriter; -use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; +use byteorder::WriteBytesExt; +use serde::Deserialize; +use serde::de::DeserializeSeed; +use serde::de::MapAccess; +use serde::de::SeqAccess; +use serde::de::Visitor; use serde_json::Value; -use super::{ByteCounter, DocumentsBatchIndex, Error}; +use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; -pub struct DocumentSerializer { - pub writer: ByteCounter, - pub buffer: Vec, - pub index: DocumentsBatchIndex, - pub count: usize, - pub allow_seq: bool, -} +struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); -impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer { - type Ok = (); +impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { + type Value = FieldId; - type Error = Error; - - type SerializeSeq = SeqSerializer<'a, W>; - type SerializeTuple = Impossible<(), Self::Error>; - type SerializeTupleStruct = Impossible<(), Self::Error>; - type SerializeTupleVariant = Impossible<(), Self::Error>; - type SerializeMap = MapSerializer<'a, &'a mut ByteCounter>; - type SerializeStruct = Impossible<(), Self::Error>; - type SerializeStructVariant = Impossible<(), Self::Error>; - fn serialize_map(self, _len: Option) -> Result { - 
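        // Every document starts a fresh obkv map writer over the shared, cleared buffer,
        // and the running document count is bumped as soon as the map is opened.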
self.buffer.clear(); - let cursor = io::Cursor::new(&mut self.buffer); - self.count += 1; - let map_serializer = MapSerializer { - map: KvWriter::new(cursor), - index: &mut self.index, - writer: &mut self.writer, - mapped_documents: BTreeMap::new(), - }; - - Ok(map_serializer) - } - - fn serialize_seq(self, _len: Option) -> Result { - if self.allow_seq { - // Only allow sequence of documents of depth 1. - self.allow_seq = false; - Ok(SeqSerializer { serializer: self }) - } else { - Err(Error::InvalidDocumentFormat) - } - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_str(self, _v: &str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) + D: serde::Deserializer<'de> { + deserializer.deserialize_str(self) } } -pub struct 
SeqSerializer<'a, W> {
    serializer: &'a mut DocumentSerializer<W>,
}

impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
    type Ok = ();
    type Error = Error;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        value.serialize(&mut *self.serializer)?;
        Ok(())
    }

    fn end(self) -> Result<Self::Ok, Self::Error> {
        Ok(())
    }
}

pub struct MapSerializer<'a, W> {
    map: KvWriter<Cursor<&'a mut Vec<u8>>, FieldId>,
    index: &'a mut DocumentsBatchIndex,
    writer: W,
    mapped_documents: BTreeMap<FieldId, Value>,
}

/// This implementation of SerializeMap uses serialize_entry instead of serialize_key and
/// serialize_value, therefore these two methods remain unimplemented.
-impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { - type Ok = (); - type Error = Error; - - fn serialize_key(&mut self, _key: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn serialize_value(&mut self, _value: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn end(mut self) -> Result { - let mut buf = Vec::new(); - for (key, value) in self.mapped_documents { - buf.clear(); - let mut cursor = Cursor::new(&mut buf); - serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?; - self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?; + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + self.values.insert(key, value); } - let data = self.map.into_inner().map_err(Error::Io)?.into_inner(); - let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?; + self.obkv_buffer.clear(); + let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); + for (key, value) in self.values.iter() { + self.value_buffer.clear(); + // This is guaranteed to work + serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); + obkv.insert(*key, &self.value_buffer).unwrap(); + } - self.writer.write_u32::(data_len).map_err(Error::Io)?; - self.writer.write_all(&data).map_err(Error::Io)?; + let reader = obkv.into_inner().unwrap().into_inner(); + + self.inner.write_u32::(reader.len() as u32).unwrap(); + self.inner.write_all(reader).unwrap(); + + *self.count += 1; Ok(()) } - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: Serialize, - V: Serialize, - { - let field_serializer = FieldSerializer { index: &mut self.index }; - let field_id: FieldId = key.serialize(field_serializer)?; - - let value = serde_json::to_value(value).map_err(Error::JsonError)?; - - self.mapped_documents.insert(field_id, value); - - Ok(()) + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") } } -struct FieldSerializer<'a> { - index: &'a mut DocumentsBatchIndex, -} +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where W: Write, +{ + type Value = (); -impl<'a> serde::Serializer for FieldSerializer<'a> { - type Ok = FieldId; - - type Error = Error; - - type SerializeSeq = Impossible; - type SerializeTuple = Impossible; - type SerializeTupleStruct = Impossible; - type SerializeTupleVariant = Impossible; - type SerializeMap = Impossible; - type SerializeStruct = Impossible; - type SerializeStructVariant = Impossible; - - fn serialize_str(self, ws: &str) -> Result { - let field_id = match self.index.get_by_right(ws) { - Some(field_id) => *field_id, - None => { - let field_id = self.index.len() as FieldId; - self.index.insert(field_id, ws.to_string()); - field_id - } - }; - - Ok(field_id) - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - 
Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } -} - -impl serde::ser::Error for Error { - fn custom(msg: T) -> Self { - Error::Custom(msg.to_string()) + D: serde::Deserializer<'de> { + deserializer.deserialize_map(self) } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index deb51a053..5639c53fa 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,7 +68,8 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - builder.add_documents(doc).unwrap(); + todo!() + //builder.add_documents(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 92bcab0e9..58e21a615 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,7 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + todo!(); + //builder.add_documents(big_object).unwrap(); 
builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index cda0da617..d62b8ec31 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,9 +61,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - documents_builder.add_documents(doc.unwrap()).unwrap(); - } + todo!(); + //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + //documents_builder.add_documents(doc.unwrap()).unwrap(); + //} documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f3b04c4fa..3fb36b1d5 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,7 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - batch_builder.add_documents(json).unwrap(); + todo!(); + //batch_builder.add_documents(json).unwrap(); }); batch_builder.finish().unwrap(); From 0f86d6b28f333b2f75439ef3483b0f8f2f59c824 Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 21 Oct 2021 11:05:16 +0200 Subject: [PATCH 03/22] implement csv serialization --- milli/Cargo.toml | 1 + milli/src/documents/builder.rs | 103 +++++++++++++++++- milli/src/documents/mod.rs | 33 +++++- milli/src/documents/serde.rs | 14 +-- milli/src/update/index_documents/transform.rs | 8 +- 5 files changed, 142 insertions(+), 17 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..709f8d865 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -47,6 +47,7 @@ itertools = "0.10.0" # logging log = "0.4.14" logging_timer = "1.0.0" +csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 98213edd7..719580b4a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,8 @@ use std::collections::BTreeMap; +use std::collections::HashMap; use std::io; +use std::io::Cursor; +use std::io::Write; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; @@ -38,7 +41,7 @@ pub struct DocumentBatchBuilder { impl DocumentBatchBuilder { pub fn new(writer: W) -> Result { - let index = DocumentsBatchIndex::new(); + let index = DocumentsBatchIndex::default(); let mut writer = ByteCounter::new(writer); // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; @@ -101,6 +104,79 @@ impl DocumentBatchBuilder { Ok(()) } + + /// Extends the builder with json documents from a reader. + /// + /// This method can be only called once and is mutually exclusive with extend from json. This + /// is because the fields in a csv are always guaranteed to come in order, and permits some + /// optimizations. + /// + /// From csv takes care to call finish in the end. 
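    ///
    /// A minimal usage sketch (values are illustrative; the `name:type` header convention is
    /// handled by `parse_csv_header` below):
    ///
    /// ```ignore
    /// let mut writer = std::io::Cursor::new(Vec::new());
    /// let builder = DocumentBatchBuilder::new(&mut writer).unwrap();
    /// builder.from_csv("id:number,title:string\n1,hello".as_bytes()).unwrap();
    /// ```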
+ pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + + // Ensure that this is the first and only addition made with this builder + debug_assert!(self.index.is_empty()); + + let mut records = csv::Reader::from_reader(reader); + + let headers = records + .headers() + .unwrap() + .into_iter() + .map(parse_csv_header) + .map(|(k, t)| (self.index.insert(&k), t)) + .collect::>(); + + let records = records.into_records(); + + dbg!(&headers); + for record in records { + match record { + Ok(record) => { + let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::String => Value::String(value.to_string()), + }; + + serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); + writer.insert(*fid, &self.value_buffer)?; + self.value_buffer.clear(); + } + + self.inner.write_u32::(self.obkv_buffer.len() as u32)?; + self.inner.write_all(&self.obkv_buffer)?; + + self.obkv_buffer.clear(); + self.count += 1; + }, + Err(_) => panic!(), + } + } + + self.finish()?; + + Ok(()) + } +} + +#[derive(Debug)] +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. + _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } } #[cfg(test)] @@ -185,4 +261,29 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn add_documents_csv() { + let mut cursor = Cursor::new(Vec::new()); + let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let csv = "id:number,field:string\n1,hello!\n2,blabla"; + + builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + dbg!(reader.len()); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 2); + assert_eq!(document.iter().count(), 2); + + let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(document.iter().count(), 2); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index ce0539c24..9f6ebd3de 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -17,7 +17,38 @@ pub use reader::DocumentBatchReader; use crate::FieldId; /// A bidirectional map that links field ids to their name in a document batch. -pub type DocumentsBatchIndex = BiHashMap; +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct DocumentsBatchIndex(pub BiHashMap); + +impl DocumentsBatchIndex { + /// Insert the field in the map, or return it's field id if it doesn't already exists. 
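    ///
    /// A small sketch of the intended behaviour (illustrative):
    ///
    /// ```ignore
    /// let mut index = DocumentsBatchIndex::default();
    /// let a = index.insert("title");
    /// let b = index.insert("title");
    /// assert_eq!(a, b); // a field always resolves to the same id
    /// ```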
+ pub fn insert(&mut self, field: &str) -> FieldId { + match self.0.get_by_right(field) { + Some(field_id) => *field_id, + None => { + let field_id = self.0.len() as FieldId; + self.0.insert(field_id, field.to_string()); + field_id + } + } + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } + + pub fn get_id(&self, id: FieldId) -> Option<&String> { + self.0.get_by_left(&id) + } +} #[derive(Debug, Serialize, Deserialize)] struct DocumentsMetadata { diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 0d02fff6c..86fb68534 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -31,17 +31,9 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { fn visit_str(self, v: &str) -> Result where - E: serde::de::Error, { - let field_id = match self.0.get_by_right(v) { - Some(field_id) => *field_id, - None => { - let field_id = self.0.len() as FieldId; - self.0.insert(field_id, v.to_string()); - field_id - } - }; - - Ok(field_id) + E: serde::de::Error, + { + Ok(self.0.insert(v)) } fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c0c88abed..5af1eda72 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -75,7 +75,7 @@ fn create_fields_mapping( .collect() } -fn find_primary_key(index: &bimap::BiHashMap) -> Option<&str> { +fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { index .iter() .sorted_by_key(|(k, _)| *k) @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_by_left(&key).cloned(); + let key = addition_index.get_id(key).cloned(); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { @@ -544,7 +544,7 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::update::index_documents::transform::find_primary_key; + use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; #[test] fn primary_key_infered_on_first_field() { @@ -557,7 +557,7 @@ mod test { map.insert(4, "fakeId".to_string()); map.insert(0, "realId".to_string()); - assert_eq!(find_primary_key(&map), Some("realId")); + assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); } } } From 2e62925a6e5f6ee864410322758b32c39a8f1a38 Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 14:41:36 +0200 Subject: [PATCH 04/22] fix tests --- milli/src/documents/builder.rs | 51 +++++++++---------------- milli/src/documents/mod.rs | 28 ++++++++------ milli/src/documents/serde.rs | 1 + milli/src/index.rs | 1 + milli/src/search/distinct/mod.rs | 5 ++- milli/src/update/index_documents/mod.rs | 4 +- milli/tests/search/mod.rs | 10 +++-- milli/tests/search/query_criteria.rs | 4 +- 8 files changed, 49 insertions(+), 55 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 719580b4a..8c70910b5 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::collections::HashMap; use std::io; use std::io::Cursor; use std::io::Write; @@ -18,18 +17,6 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// /// 
The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to /// iterate other the documents. -/// -/// ## example: -/// ``` -/// use milli::documents::DocumentBatchBuilder; -/// use serde_json::json; -/// use std::io::Cursor; -/// -/// let mut writer = Cursor::new(Vec::new()); -/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap(); -/// builder.finish().unwrap(); -/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, @@ -100,7 +87,7 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).unwrap(); + de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; Ok(()) } @@ -112,10 +99,11 @@ impl DocumentBatchBuilder { /// optimizations. /// /// From csv takes care to call finish in the end. - pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + pub fn from_csv(reader: R, writer: W) -> Result { + let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder - debug_assert!(self.index.is_empty()); + debug_assert!(this.index.is_empty()); let mut records = csv::Reader::from_reader(reader); @@ -124,40 +112,37 @@ impl DocumentBatchBuilder { .unwrap() .into_iter() .map(parse_csv_header) - .map(|(k, t)| (self.index.insert(&k), t)) - .collect::>(); + .map(|(k, t)| (this.index.insert(&k), t)) + .collect::>(); let records = records.into_records(); - dbg!(&headers); for record in records { match record { Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { AllowedType::Number => value.parse::().map(Value::from).unwrap(), AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); - writer.insert(*fid, &self.value_buffer)?; - self.value_buffer.clear(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } - self.inner.write_u32::(self.obkv_buffer.len() as u32)?; - self.inner.write_all(&self.obkv_buffer)?; + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; - self.obkv_buffer.clear(); - self.count += 1; + this.obkv_buffer.clear(); + this.count += 1; }, Err(_) => panic!(), } } - self.finish()?; - - Ok(()) + Ok(this) } } @@ -265,18 +250,16 @@ mod test { #[test] fn add_documents_csv() { let mut cursor = Cursor::new(Vec::new()); - let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let csv = "id:number,field:string\n1,hello!\n2,blabla"; - builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + builder.finish().unwrap(); cursor.set_position(0); let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - dbg!(reader.len()); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); assert_eq!(index.len(), 2); assert_eq!(document.iter().count(), 2); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 9f6ebd3de..8a8b87794 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -135,6 +135,8 @@ macro_rules! 
documents { #[cfg(test)] mod test { + use std::io::Cursor; + use serde_json::{json, Value}; use super::*; @@ -151,13 +153,14 @@ mod test { "bool": true }); + let json = serde_json::to_vec(&json).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(json).unwrap(); + builder.extend_from_json(Cursor::new(json)).unwrap(); builder.finish().unwrap(); @@ -181,14 +184,16 @@ mod test { "toto": false, }); + let doc1 = serde_json::to_vec(&doc1).unwrap(); + let doc2 = serde_json::to_vec(&doc2).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(doc1).unwrap(); - //builder.add_documents(doc2).unwrap(); + builder.extend_from_json(Cursor::new(doc1)).unwrap(); + builder.extend_from_json(Cursor::new(doc2)).unwrap(); builder.finish().unwrap(); @@ -211,13 +216,14 @@ mod test { { "tata": "hello" }, ]); + let docs = serde_json::to_vec(&docs).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(docs).unwrap(); + builder.extend_from_json(Cursor::new(docs)).unwrap(); builder.finish().unwrap(); @@ -245,13 +251,13 @@ mod test { { "tata": "hello" }, ]]); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + let docs = serde_json::to_vec(&docs).unwrap(); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); let docs = json!("hello"); + let docs = serde_json::to_vec(&docs).unwrap(); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 86fb68534..2466ed373 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -98,6 +98,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { self.inner.write_all(reader).unwrap(); *self.count += 1; + self.values.clear(); Ok(()) } diff --git a/milli/src/index.rs b/milli/src/index.rs index 6ce693fbe..fe89fe734 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -953,6 +953,7 @@ pub(crate) mod tests { { "id": 1, "name": "kevin", "has_dog": true }, { "id": 2, "name": "bob" } ]); + let mut wtxn = index.write_txn().unwrap(); let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5639c53fa..11f6379e3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,8 +68,9 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - todo!() - //builder.add_documents(doc).unwrap(); + + let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); + builder.extend_from_json(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58e21a615..17c778060 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,8 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(big_object).unwrap(); + let big_object = 
Cursor::new(serde_json::to_vec(&big_object).unwrap()); + builder.extend_from_json(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d62b8ec31..e8fb3fdfa 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,10 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - todo!(); - //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - //documents_builder.add_documents(doc.unwrap()).unwrap(); - //} + + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); + documents_builder.extend_from_json(doc).unwrap(); + } + documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 3fb36b1d5..e5dde049c 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,8 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - todo!(); - //batch_builder.add_documents(json).unwrap(); + let json = Cursor::new(serde_json::to_vec(&json).unwrap()); + batch_builder.extend_from_json(json).unwrap(); }); batch_builder.finish().unwrap(); From 53c79e85f2a0acd4559c11eb01f27914ca1c5ccb Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 15:39:56 +0200 Subject: [PATCH 05/22] document errors --- milli/src/documents/builder.rs | 20 +++++++------------ milli/src/documents/mod.rs | 30 +++++++++++++++++++++++++--- milli/src/documents/serde.rs | 36 ++++++++++++++++++++++------------ 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 8c70910b5..a7b0c90a4 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -87,18 +87,13 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; - - Ok(()) + de.deserialize_any(&mut visitor).map_err(Error::JsonError)? } - /// Extends the builder with json documents from a reader. + /// Creates a builder from a reader of CSV documents. /// - /// This method can be only called once and is mutually exclusive with extend from json. This - /// is because the fields in a csv are always guaranteed to come in order, and permits some - /// optimizations. - /// - /// From csv takes care to call finish in the end. + /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform + /// optimisations, and extending from another CSV is not allowed. pub fn from_csv(reader: R, writer: W) -> Result { let mut this = Self::new(writer)?; @@ -108,8 +103,7 @@ impl DocumentBatchBuilder { let mut records = csv::Reader::from_reader(reader); let headers = records - .headers() - .unwrap() + .headers()? 
.into_iter() .map(parse_csv_header) .map(|(k, t)| (this.index.insert(&k), t)) @@ -123,11 +117,11 @@ impl DocumentBatchBuilder { let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::Number => value.parse::().map(Value::from)?, AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; writer.insert(*fid, &this.value_buffer)?; this.value_buffer.clear(); } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 8a8b87794..bbc114480 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,7 +7,9 @@ mod builder; mod reader; mod serde; -use std::{fmt, io}; +use std::num::ParseFloatError; +use std::io; +use std::fmt::{self, Debug}; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; @@ -81,14 +83,22 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { + ParseFloat(std::num::ParseFloatError), InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), + CsvError(csv::Error), Serialize(bincode::Error), Io(io::Error), DocumentTooLarge, } +impl From for Error { + fn from(e: csv::Error) -> Self { + Self::CsvError(e) + } +} + impl From for Error { fn from(other: io::Error) -> Self { Self::Io(other) @@ -101,15 +111,29 @@ impl From for Error { } } +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::JsonError(other) + } +} + +impl From for Error { + fn from(other: ParseFloatError) -> Self { + Self::ParseFloat(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { + Error::ParseFloat(e) => write!(f, "{}", e), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), - Error::Io(e) => e.fmt(f), + Error::Io(e) => write!(f, "{}", e), Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), - Error::Serialize(e) => e.fmt(f), + Error::Serialize(e) => write!(f, "{}", e), + Error::CsvError(e) => write!(f, "{}", e), } } } diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 2466ed373..4dfdca2c7 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -11,9 +11,19 @@ use serde::de::SeqAccess; use serde::de::Visitor; use serde_json::Value; +use super::Error; use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { @@ -36,8 +46,8 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { Ok(self.0.insert(v)) } - fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { - todo!() + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a string") } } @@ -64,22 +74,22 @@ pub struct DocumentVisitor<'a, W> { impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { /// This Visitor value is nothing, since it write the value to a file. 
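    /// Errors are smuggled out through the `Ok` variant instead (see the `tri!` macro above):
    /// serde only lets a visitor fail with the deserializer's own error type, so the crate's
    /// `Error` is returned as `Ok(Err(_))` and unwrapped by the caller.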
- type Value = (); + type Value = Result<(), Error>; fn visit_seq(self, mut seq: A) -> Result where A: SeqAccess<'de>, { - while let Some(_) = seq.next_element_seed(&mut *self)? { } + while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } - Ok(()) + Ok(Ok(())) } fn visit_map(self, mut map: A) -> Result where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { self.values.insert(key, value); } @@ -88,19 +98,19 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { for (key, value) in self.values.iter() { self.value_buffer.clear(); // This is guaranteed to work - serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); - obkv.insert(*key, &self.value_buffer).unwrap(); + tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); + tri!(obkv.insert(*key, &self.value_buffer)); } - let reader = obkv.into_inner().unwrap().into_inner(); + let reader = tri!(obkv.into_inner()).into_inner(); - self.inner.write_u32::(reader.len() as u32).unwrap(); - self.inner.write_all(reader).unwrap(); + tri!(self.inner.write_u32::(reader.len() as u32)); + tri!(self.inner.write_all(reader)); *self.count += 1; self.values.clear(); - Ok(()) + Ok(Ok(())) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -111,7 +121,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> where W: Write, { - type Value = (); + type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where From 430e9b13d3523a17a5354a7a2f8d955a3097a008 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 09:48:53 +0200 Subject: [PATCH 06/22] add csv builder tests --- milli/src/documents/builder.rs | 324 ++++++++++++++++-- milli/src/documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 2 +- 3 files changed, 297 insertions(+), 31 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index a7b0c90a4..d54d9639c 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -54,12 +54,7 @@ impl DocumentBatchBuilder { /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let Self { - inner: ByteCounter { mut writer, count: offset }, - index, - count, - .. - } = self; + let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -73,7 +68,6 @@ impl DocumentBatchBuilder { Ok(()) } - /// Extends the builder with json documents from a reader. pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { let mut de = serde_json::Deserializer::from_reader(reader); @@ -95,7 +89,6 @@ impl DocumentBatchBuilder { /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform /// optimisations, and extending from another CSV is not allowed. 
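    ///
    /// A short sketch of the call pattern, mirroring the tests below (values illustrative):
    ///
    /// ```ignore
    /// let mut writer = std::io::Cursor::new(Vec::new());
    /// let builder = DocumentBatchBuilder::from_csv("id:number,title:string\n1,hello".as_bytes(), &mut writer).unwrap();
    /// builder.finish().unwrap();
    /// ```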
pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder debug_assert!(this.index.is_empty()); @@ -112,28 +105,24 @@ impl DocumentBatchBuilder { let records = records.into_records(); for record in records { - match record { - Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, - AllowedType::String => Value::String(value.to_string()), - }; + let record = record?; + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::String => Value::String(value.to_string()), + }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; - this.value_buffer.clear(); - } - - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.obkv_buffer.clear(); - this.count += 1; - }, - Err(_) => panic!(), + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } + + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; + + this.obkv_buffer.clear(); + this.count += 1; } Ok(this) @@ -162,10 +151,25 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { mod test { use std::io::Cursor; + use serde_json::{json, Map}; + use crate::documents::DocumentBatchReader; use super::*; + fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { + let mut map = Map::new(); + + for (fid, value) in obkv.iter() { + let field_name = index.name(fid).unwrap().clone(); + let value: Value = serde_json::from_slice(value).unwrap(); + + map.insert(field_name, value); + } + + Value::Object(map) + } + #[test] fn add_single_documents_json() { let mut cursor = Cursor::new(Vec::new()); @@ -247,7 +251,8 @@ mod test { let csv = "id:number,field:string\n1,hello!\n2,blabla"; - let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + let builder = + DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); builder.finish().unwrap(); cursor.set_position(0); @@ -263,4 +268,265 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn simple_csv_document() { + let documents = r#"city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn coma_in_field() { + let documents = r#"city,country,pop +"Boston","United, States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let 
mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United, States", + "pop": "4628910", + }) + ); + } + + #[test] + fn quote_in_field() { + let documents = r#"city,country,pop +"Boston","United"" States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United\" States", + "pop": "4628910", + }) + ); + } + + #[test] + fn integer_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.0, + }) + ); + } + + #[test] + fn float_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910.01""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.01, + }) + ); + } + + #[test] + fn several_colon_in_header() { + let documents = r#"city:love:string,country:state,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:love": "Boston", + "country:state": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn ending_by_colon_in_header() { + let documents = r#"city:,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_colon_in_header() { + let documents = r#":city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = 
DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + ":city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[ignore] + #[test] + fn starting_by_colon_in_header2() { + let documents = r#":string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + + assert!(reader.next_document_with_index().is_err()); + } + + #[test] + fn double_colon_in_header() { + let documents = r#"city::string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn bad_type_in_header() { + let documents = r#"city,country:number,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count1() { + let documents = r#"city,country,pop +"Boston","United States","4628910", "too much""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count2() { + let documents = r#"city,country,pop +"Boston","United States""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index bbc114480..b61a3326b 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -47,7 +47,7 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn get_id(&self, id: FieldId) -> Option<&String> { + pub fn name(&self, id: FieldId) -> Option<&String> { self.0.get_by_left(&id) } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 5af1eda72..eac60effb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_id(key).cloned(); + let key = addition_index.name(key).cloned(); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { From 3fcccc31b5885029499056adaaf42853ab45d1ce Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 10:26:28 +0200 Subject: [PATCH 07/22] add document builder example --- milli/src/documents/builder.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index d54d9639c..144dcdfa9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -16,7 +16,20 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, 
Error}; /// format used by milli. /// /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to -/// iterate other the documents. +/// iterate over the documents. +/// +/// ## example: +/// ``` +/// use milli::documents::DocumentBatchBuilder; +/// use serde_json::json; +/// use std::io::Cursor; +/// +/// let json = r##"{"id": 1, "name": "foo"}"##; +/// let mut writer = Cursor::new(Vec::new()); +/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); +/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.finish().unwrap(); +/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, From 679fe18b17f7f1458cf336f8299be20ad2aca98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:52:17 +0200 Subject: [PATCH 08/22] Update version for the next release (v0.19.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index b3a15f100..341b8eb7c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index a7af6fb9b..310388e01 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2701c36d7..726fa9c5f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 01309797d..12f474f12 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.1" +version = "0.19.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index dd74ad7b6..d9441984e 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" From 208903dddea1567f2c2c3afe719f0f512b9d0fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:58:00 +0200 Subject: [PATCH 09/22] Revert "Replacing pest with nom " --- milli/Cargo.toml | 3 +- milli/src/error.rs | 11 +- milli/src/lib.rs | 3 + milli/src/search/facet/filter_condition.rs | 708 ++++++++++++++++++++- milli/src/search/facet/filter_parser.rs | 622 ------------------ milli/src/search/facet/grammar.pest | 33 + milli/src/search/facet/mod.rs | 5 +- milli/src/search/facet/parser.rs | 12 + milli/src/search/mod.rs | 3 +- 9 files changed, 736 insertions(+), 664 deletions(-) delete mode 100644 milli/src/search/facet/filter_parser.rs create mode 100644 milli/src/search/facet/grammar.pest create mode 100644 milli/src/search/facet/parser.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..3792204e9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -39,7 +39,8 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser -nom = "7.0.0" +pest = { git = 
"https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } +pest_derive = "2.1.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/error.rs b/milli/src/error.rs index c0ce101c8..1f1cc5264 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; +use crate::search::ParserRule; use crate::{CriterionError, DocumentId, FieldId, SortError}; pub type Object = Map; @@ -58,9 +59,9 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFilter(pest::error::Error), + InvalidFilterAttribute(pest::error::Error), InvalidGeoField { document_id: Value, object: Value }, - InvalidFilter { input: String }, - InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -207,7 +208,6 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidFilter { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), @@ -220,6 +220,7 @@ impl fmt::Display for UserError { name_list ) } + Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, "the document with the id: {} contains an invalid _geo field: {}", @@ -235,9 +236,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco json ) } - Self::InvalidSortName { name } => { - write!(f, "Invalid syntax for the sort parameter: {}", name) - } + Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fe5947f5..781cedb2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate pest_derive; + #[macro_use] pub mod documents; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 4fedeee69..f1055b2f8 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,20 +1,60 @@ +use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; +use std::result::Result as StdResult; +use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; +use itertools::Itertools; use log::debug; -use nom::error::{convert_error, VerboseError}; +use pest::error::{Error as PestError, ErrorVariant}; +use pest::iterators::{Pair, Pairs}; +use pest::Parser; use roaring::RoaringBitmap; use self::FilterCondition::*; -use super::filter_parser::{Operator, ParseContext}; +use self::Operator::*; +use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::{Error, UserError}; +use crate::error::UserError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{ + distance_between_two_points, 
CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, +}; + +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { + GreaterThan(f64), + GreaterThanOrEqual(f64), + Equal(Option, String), + NotEqual(Option, String), + LowerThan(f64), + LowerThanOrEqual(f64), + Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), +} + +impl Operator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + fn negate(self) -> (Self, Option) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), + } + } +} #[derive(Debug, Clone, PartialEq)] pub enum FilterCondition { @@ -36,7 +76,7 @@ impl FilterCondition { A: AsRef, B: AsRef, { - let mut ands: Option = None; + let mut ands = None; for either in array { match either { @@ -77,23 +117,41 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let ctx = - ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; - match ctx.parse_expression::>(expression) { - Ok((_, fc)) => Ok(fc), - Err(e) => { - let ve = match e { - nom::Err::Error(x) => x, - nom::Err::Failure(x) => x, - _ => unreachable!(), - }; - Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(expression, ve).to_string(), - })) - } - } + let lexed = + FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } - pub fn negate(self) -> FilterCondition { + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashSet, + expression: Pairs, + ) -> Result { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + _ => unreachable!(), + }, + |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), + }, + ) + } + + fn negate(self) -> FilterCondition { match self { Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -104,6 +162,189 @@ impl FilterCondition { Empty => Empty, } } + + fn geo_radius( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + if !filterable_fields.contains("_geo") { + return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( + 
ErrorVariant::CustomError { + message: format!( + "attribute `_geo` is not filterable, available filterable attributes are: {}", + filterable_fields.iter().join(", "), + ), + }, + item.as_span(), + )))?; + } + let mut items = item.into_inner(); + let fid = match fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok(Empty), + }; + let parameters_item = items.next().unwrap(); + // We don't need more than 3 parameters, but to handle errors correctly we are still going + // to extract the first 4 parameters + let param_span = parameters_item.as_span(); + let parameters = parameters_item + .into_inner() + .take(4) + .map(|param| (param.clone(), param.as_span())) + .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) + .collect::, _>>() + .map_err(UserError::InvalidFilter)?; + if parameters.len() != 3 { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + }, + // we want to point to the last parameters and if there was no parameters we + // point to the parenthesis + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; + } + let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); + if !(-90.0..=90.0).contains(&lat.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Latitude must be contained between -90 and 90 degrees."), + }, + lat.1.clone(), + )))?; + } else if !(-180.0..=180.0).contains(&lng.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Longitude must be contained between -180 and 180 degrees."), + }, + lng.1.clone(), + )))?; + } + Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) + } + + fn between( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let (lresult, _) = pest_parse(items.next().unwrap()); + let (rresult, _) = pest_parse(items.next().unwrap()); + + let lvalue = lresult.map_err(UserError::InvalidFilter)?; + let rvalue = rresult.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, Between(lvalue, rvalue))) + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, svalue) = pest_parse(value); + + let svalue = svalue.to_lowercase(); + Ok(Operator(fid, Equal(result.ok(), svalue))) + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? 
+ { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThan(value))) + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThanOrEqual(value))) + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThan(value))) + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThanOrEqual(value))) + } } impl FilterCondition { @@ -227,9 +468,9 @@ impl FilterCondition { // as the facets values are all in the same database and prefixed by the // field id and the level. let (left, right) = match operator { - Operator::GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), - Operator::GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Operator::Equal(number, string) => { + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Equal(number, string) => { let (_original_value, string_docids) = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { @@ -251,23 +492,23 @@ impl FilterCondition { }; return Ok(string_docids | number_docids); } - Operator::NotEqual(number, string) => { + NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? 
} else { RoaringBitmap::new() }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; - let operator = Operator::Equal(*number, string.clone()); + let operator = Equal(*number, string.clone()); let docids = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, &operator, )?; return Ok((all_numbers_ids | all_strings_ids) - docids); } - Operator::LowerThan(val) => (Included(f64::MIN), Excluded(*val)), - Operator::LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Operator::Between(left, right) => (Included(*left), Included(*right)), - Operator::GeoLowerThan(base_point, distance) => { + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Between(left, right) => (Included(*left), Included(*right)), + GeoLowerThan(base_point, distance) => { let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(RoaringBitmap::new()), @@ -283,14 +524,14 @@ impl FilterCondition { return Ok(result); } - Operator::GeoGreaterThan(point, distance) => { + GeoGreaterThan(point, distance) => { let result = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, - &Operator::GeoLowerThan(point.clone(), *distance), + &GeoLowerThan(point.clone(), *distance), )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; return Ok(geo_faceted_doc_ids - result); @@ -344,3 +585,406 @@ impl FilterCondition { } } } + +/// Retrieve the field id base on the pest value. +/// +/// Returns an error if the given value is not filterable. +/// +/// Returns Ok(None) if the given value is filterable, but is not yet ascociated to a field_id. +/// +/// The pest pair is simply a string associated with a span, a location to highlight in +/// the error message. +fn field_id( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + items: &mut Pairs, +) -> StdResult, PestError> { + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + if key.as_rule() == Rule::reserved { + let message = match key.as_str() { + key if key.starts_with("_geoPoint") => { + format!( + "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + ) + } + key @ "_geo" => { + format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + key + ) + } + key => format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + key + ), + }; + return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); + } + + if !filterable_fields.contains(key.as_str()) { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not filterable, available filterable attributes are: {}.", + key.as_str(), + filterable_fields.iter().join(", "), + ), + }, + key.as_span(), + )); + } + + Ok(fields_ids_map.id(key.as_str())) +} + +/// Tries to parse the pest pair into the type `T` specified, always returns +/// the original string that we tried to parse. +/// +/// Returns the parsing error associated with the span if the conversion fails. 
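+///
+/// A rough usage sketch (illustrative only, not a compiled doc-test; `pair`
+/// is assumed to be a token produced by the filter grammar):
+///
+/// ```ignore
+/// let (result, raw) = pest_parse::<f64>(pair);
+/// match result {
+///     Ok(number) => println!("parsed {} out of {:?}", number, raw),
+///     Err(error) => println!("{:?} is not a valid number: {}", raw, error),
+/// }
+/// ```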
+fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, PestError<Rule>>, String)
+where
+    T: FromStr,
+    T::Err: ToString,
+{
+    let result = match pair.as_str().parse::<T>() {
+        Ok(value) => Ok(value),
+        Err(e) => Err(PestError::<Rule>::new_from_span(
+            ErrorVariant::CustomError { message: e.to_string() },
+            pair.as_span(),
+        )),
+    };
+
+    (result, pair.as_str().to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+    use heed::EnvOpenOptions;
+    use maplit::hashset;
+
+    use super::*;
+    use crate::update::Settings;
+
+    #[test]
+    fn string() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut map = index.fields_ids_map(&wtxn).unwrap();
+        map.insert("channel");
+        index.put_fields_ids_map(&mut wtxn, &map).unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_filterable_fields(hashset! { S("channel") });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Test that the facet condition is correctly generated.
+        let rtxn = index.read_txn().unwrap();
+        let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap();
+        let expected = Operator(0, Operator::Equal(None, S("ponce")));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
+        let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
+        let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
+        assert_eq!(condition, expected);
+    }
+
+    #[test]
+    fn number() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut map = index.fields_ids_map(&wtxn).unwrap();
+        map.insert("timestamp");
+        index.put_fields_ids_map(&mut wtxn, &map).unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_filterable_fields(hashset! { "timestamp".into() });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Test that the facet condition is correctly generated.
+        let rtxn = index.read_txn().unwrap();
+        let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
+        let expected = Operator(0, Between(22.0, 44.0));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
+        let expected =
+            Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))));
+        assert_eq!(condition, expected);
+    }
+
+    #[test]
+    fn parentheses() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
+        builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(And( + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(Or( + Box::new(Or( + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), + )), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + } + + #[test] + fn reserved_field_names() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let rtxn = index.read_txn().unwrap(); + + let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); + assert!(error + .to_string() + .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), + "{}", + error.to_string() + ); + + let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + } + + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. 
+ let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // _geo is not filterable + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the max angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(90, 180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([90., 180.], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the min angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90, -180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([-90., -180.], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter 
expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + } + + #[test] + fn from_array() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array( + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ) + .unwrap(); + assert_eq!(condition, expected); + } +} diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs deleted file mode 100644 index 4d8a54987..000000000 --- a/milli/src/search/facet/filter_parser.rs +++ /dev/null @@ -1,622 +0,0 @@ -use std::collections::HashSet; -use std::fmt::Debug; -use std::result::Result as StdResult; - -use nom::branch::alt; -use nom::bytes::complete::{tag, take_while1}; -use nom::character::complete::{char, multispace0}; -use nom::combinator::map; -use nom::error::{ContextError, ErrorKind, VerboseError}; -use nom::multi::{many0, separated_list1}; -use nom::sequence::{delimited, preceded, tuple}; -use nom::IResult; - -use self::Operator::*; -use super::FilterCondition; -use crate::{FieldId, FieldsIdsMap}; -#[derive(Debug, Clone, PartialEq)] -pub enum Operator { - GreaterThan(f64), - GreaterThanOrEqual(f64), - Equal(Option, String), - NotEqual(Option, String), - LowerThan(f64), - LowerThanOrEqual(f64), - Between(f64, f64), - GeoLowerThan([f64; 2], f64), - GeoGreaterThan([f64; 2], f64), -} - -impl Operator { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). 
- pub fn negate(self) -> (Self, Option) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), - GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), - GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), - } - } -} - -pub trait FilterParserError<'a>: - nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug -{ -} - -impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} - -pub struct ParseContext<'a> { - pub fields_ids_map: &'a FieldsIdsMap, - pub filterable_fields: &'a HashSet, -} - -impl<'a> ParseContext<'a> { - fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_and(input)?; - let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; - - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_not(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("AND")), |c| Self::parse_and(self, c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - map( - preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { - Self::parse_condition_expression(self, c) - }), - |e| e.negate(), - ), - |c| Self::parse_condition_expression(self, c), - ))(input) - } - - fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> - where - F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: FilterParserError<'a>, - { - delimited(multispace0, inner, multispace0) - } - - fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); - let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( - input, - ); - let (input, (key, op, value)) = match k { - Ok(o) => o, - Err(e) => { - return Err(e); - } - }; - - let fid = self.parse_fid(input, key)?; - let r: StdResult>> = self.parse_numeric(value); - let k = match op { - "=" => FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), - "!=" => { - FilterCondition::Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())) - } - ">" | "<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_numeric(&'a self, input: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - T: std::str::FromStr, - { - match input.parse::() { - Ok(n) => Ok(n), - Err(_) => { - return match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - } - } - } - - fn parse_numeric_unary_condition( - &'a self, - input: &'a str, 
- fid: u16, - value: &'a str, - ) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let numeric: f64 = self.parse_numeric(value)?; - let k = match input { - ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), - "<" => FilterCondition::Operator(fid, LowerThan(numeric)), - "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), - ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - { - let error = match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - if !self.filterable_fields.contains(key) { - return error; - } - match self.fields_ids_map.id(key) { - Some(fid) => Ok(fid), - None => error, - } - } - - fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_key(c)), - self.ws(|c| self.parse_key(c)), - tag("TO"), - self.ws(|c| self.parse_key(c)), - ))(input)?; - - let fid = self.parse_fid(input, key)?; - let numeric_from: f64 = self.parse_numeric(from)?; - let numeric_to: f64 = self.parse_numeric(to)?; - let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); - - Ok((input, res)) - } - - fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_latitude_invalid = - "_geoRadius. Latitude must be contained between -90 and 90 degrees."; - - let err_msg_longitude_invalid = - "_geoRadius. 
Longitude must be contained between -180 and 180 degrees."; - - let (input, args): (&str, Vec<&str>) = match preceded( - tag("_geoRadius"), - delimited( - char('('), - separated_list1(tag(","), self.ws(|c| self.parse_value::(c))), - char(')'), - ), - )(input) - { - Ok(e) => e, - Err(_e) => { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_args_incomplete, - E::from_char(input, '('), - ))); - } - }; - - if args.len() != 3 { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); - } - let lat = self.parse_numeric(args[0])?; - let lng = self.parse_numeric(args[1])?; - let dis = self.parse_numeric(args[2])?; - - let fid = match self.fields_ids_map.id("_geo") { - Some(fid) => fid, - None => return Ok((input, FilterCondition::Empty)), - }; - - if !(-90.0..=90.0).contains(&lat) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_latitude_invalid, - E::from_char(input, '('), - ))); - } else if !(-180.0..=180.0).contains(&lng) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_longitude_invalid, - E::from_char(input, '('), - ))); - } - - let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); - Ok((input, res)) - } - - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let l1 = |c| self.parse_simple_condition(c); - let l2 = |c| self.parse_range_condition(c); - let l3 = |c| self.parse_geo_radius(c); - alt((l1, l2, l3))(input) - } - - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), - |c| Self::parse_condition(self, c), - ))(input) - } - - fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) - } - - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - self.parse_or(input) - } -} - -#[cfg(test)] -mod tests { - use big_s::S; - use either::Either; - use heed::EnvOpenOptions; - use maplit::hashset; - - use super::*; - use crate::update::Settings; - use crate::Index; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("channel"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { S("channel") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::Equal(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - } - - #[test] - fn number() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, LowerThan(22.0))), - Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = FilterCondition::Operator(0, LowerThan(20.0)); - assert_eq!(condition, expected); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); - let expected = FilterCondition::Operator(2, LowerThan(200.0)); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), - Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Operator(1, LowerThan(22.0))), - Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), - )), - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - - #[test] - fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = 
result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - } -} diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest new file mode 100644 index 000000000..8bfdeb667 --- /dev/null +++ b/milli/src/search/facet/grammar.pest @@ -0,0 +1,33 @@ +key = _{reserved | quoted | word } +value = _{quoted | word } +quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } +string = {char*} +word = ${(LETTER | NUMBER | "_" | "-" | ".")+} + +char = _{ !(PEEK | "\\") ~ ANY + | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") + | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} + +reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } +// we deliberately choose to allow empty parameters to generate more specific error message later +parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} +condition = _{between | eq | greater | less | geq | leq | neq} +between = {key ~ value ~ "TO" ~ value} +geq = {key ~ ">=" ~ value} +leq = {key ~ "<=" ~ value} +neq = {key ~ "!=" ~ value} +eq = {key ~ "=" ~ value} +greater = {key ~ ">" ~ value} +less = {key ~ "<" ~ value} +geo_radius = {"_geoRadius" ~ parameters } + +prgm = {SOI ~ expr ~ EOI} +expr = _{ ( term ~ (operation ~ term)* ) } +term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } +operation = _{ and | or } +and = {"AND"} +or = {"OR"} + +not = {"NOT" ~ term} + +WHITESPACE = _{ " " } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 3efa0262f..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,11 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use self::filter_condition::FilterCondition; +pub use self::filter_condition::{FilterCondition, Operator}; +pub(crate) use self::parser::Rule as ParserRule; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; -mod filter_parser; +mod parser; diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs new file mode 100644 index 000000000..1bff27cfb --- /dev/null +++ b/milli/src/search/facet/parser.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use pest::prec_climber::{Assoc, Operator, PrecClimber}; + +pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { + use Assoc::*; + use Rule::*; + pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) +}); + +#[derive(Parser)] +#[grammar = "search/facet/grammar.pest"] +pub struct FilterParser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 9b76ca851..bec059d46 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,8 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use 
self::facet::{FacetDistribution, FacetNumberIter, FilterCondition}; +pub(crate) use self::facet::ParserRule; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; From f9445c1d9059a77274f70508e2d0c896d9404bbb Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 16:07:57 +0200 Subject: [PATCH 10/22] return float parsing error context in csv --- milli/src/documents/builder.rs | 11 +++++++---- milli/src/documents/mod.rs | 14 ++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 144dcdfa9..c2b0e01cc 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -115,14 +115,17 @@ impl DocumentBatchBuilder { .map(|(k, t)| (this.index.insert(&k), t)) .collect::>(); - let records = records.into_records(); - - for record in records { + for (i, record) in records.into_records().enumerate() { let record = record?; let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + })?, AllowedType::String => Value::String(value.to_string()), }; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index b61a3326b..6f653d825 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -83,7 +83,11 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat(std::num::ParseFloatError), + ParseFloat { + error: std::num::ParseFloatError, + line: usize, + value: String, + }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -117,16 +121,10 @@ impl From for Error { } } -impl From for Error { - fn from(other: ParseFloatError) -> Self { - Self::ParseFloat(other) - } -} - impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat(e) => write!(f, "{}", e), + Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), From baddd80069b26d66421ce68a7a2e004e148646b0 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 17:38:32 +0200 Subject: [PATCH 11/22] implement review suggestions --- benchmarks/benches/utils.rs | 32 +- cli/src/main.rs | 28 +- http-ui/src/documents_from_csv.rs | 285 ------------------ http-ui/src/main.rs | 28 +- milli/src/documents/builder.rs | 54 ++-- milli/src/documents/mod.rs | 17 +- milli/src/documents/serde.rs | 39 +-- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/transform.rs | 3 +- 9 files changed, 89 insertions(+), 400 deletions(-) delete mode 100644 http-ui/src/documents_from_csv.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 24f5d5343..dbe8fffad 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use 
std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? > 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/cli/src/main.rs b/cli/src/main.rs index b84ff3243..8e28d4a25 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,5 +1,5 @@ use std::fs::File; -use std::io::{stdin, Cursor, Read}; +use std::io::{stdin, BufRead, BufReader, Cursor, Read}; use std::path::PathBuf; use std::str::FromStr; @@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; -use serde_json::{Map, Value}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result> { fn documents_from_csv(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let mut records = csv::Reader::from_reader(reader); - let iter = records.deserialize::>(); - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/http-ui/src/documents_from_csv.rs b/http-ui/src/documents_from_csv.rs deleted file mode 100644 index 2b62f23c2..000000000 --- a/http-ui/src/documents_from_csv.rs +++ /dev/null @@ -1,285 +0,0 @@ -use std::io::{Read, Result as IoResult}; -use std::num::ParseFloatError; - -use serde_json::{Map, Value}; - -enum AllowedType { - String, - Number, -} - -fn parse_csv_header(header: &str) -> (String, AllowedType) { - // if there are several separators we only split on the last one. - match header.rsplit_once(':') { - Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), - // we may return an error in this case. - _otherwise => (header.to_string(), AllowedType::String), - }, - None => (header.to_string(), AllowedType::String), - } -} - -pub struct CSVDocumentDeserializer -where - R: Read, -{ - documents: csv::StringRecordsIntoIter, - headers: Vec<(String, AllowedType)>, -} - -impl CSVDocumentDeserializer { - pub fn from_reader(reader: R) -> IoResult { - let mut records = csv::Reader::from_reader(reader); - - let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); - - Ok(Self { documents: records.into_records(), headers }) - } -} - -impl Iterator for CSVDocumentDeserializer { - type Item = anyhow::Result>; - - fn next(&mut self) -> Option { - let csv_document = self.documents.next()?; - - match csv_document { - Ok(csv_document) => { - let mut document = Map::new(); - - for ((field_name, field_type), value) in - self.headers.iter().zip(csv_document.into_iter()) - { - let parsed_value: Result = match field_type { - AllowedType::Number => { - value.parse::().map(Value::from).map_err(Into::into) - } - AllowedType::String => Ok(Value::String(value.to_string())), - }; - - match parsed_value { - Ok(value) => drop(document.insert(field_name.to_string(), value)), - Err(_e) => { - return Some(Err(anyhow::anyhow!( - "Value '{}' is not a valid number", - value - ))) - } - } - } - - Some(Ok(document)) - } - Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), - } - } -} - -#[cfg(test)] -mod test { - use serde_json::json; - - use super::*; - - #[test] - fn simple_csv_document() { - let documents = r#"city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": 
"Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn coma_in_field() { - let documents = r#"city,country,pop -"Boston","United, States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United, States", - "pop": "4628910", - }) - ); - } - - #[test] - fn quote_in_field() { - let documents = r#"city,country,pop -"Boston","United"" States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United\" States", - "pop": "4628910", - }) - ); - } - - #[test] - fn integer_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.0, - }) - ); - } - - #[test] - fn float_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910.01""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.01, - }) - ); - } - - #[test] - fn several_double_dot_in_header() { - let documents = r#"city:love:string,country:state,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:love": "Boston", - "country:state": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn ending_by_double_dot_in_header() { - let documents = r#"city:,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header() { - let documents = r#":city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - ":city": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header2() { - let documents = r#":string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn double_double_dot_in_header() { - let documents = r#"city::string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn 
bad_type_in_header() { - let documents = r#"city,country:number,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count1() { - let documents = r#"city,country,pop -"Boston","United States","4628910", "too much""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count2() { - let documents = r#"city,country,pop -"Boston","United States""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } -} diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index d27c6d5bb..9e9fe4a2b 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,10 +1,9 @@ -mod documents_from_csv; mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::Cursor; +use std::io::{BufRead, BufReader, Cursor}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -39,7 +38,6 @@ use warp::http::Response; use warp::Filter; use self::update_store::UpdateStore; -use crate::documents_from_csv::CSVDocumentDeserializer; #[cfg(target_os = "linux")] #[global_allocator] @@ -1041,11 +1039,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -1056,8 +1054,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -1065,16 +1062,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index c2b0e01cc..6ba890b79 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,16 +1,14 @@ use std::collections::BTreeMap; use std::io; -use std::io::Cursor; -use std::io::Write; +use std::io::{Cursor, Write}; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use crate::FieldId; - use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; +use crate::FieldId; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -27,7 +25,7 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// let json = r##"{"id": 1, "name": "foo"}"##; /// let mut writer = Cursor::new(Vec::new()); /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.extend_from_json(&mut json.as_bytes()).unwrap(); /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { @@ -46,16 +44,14 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let this = Self { + Ok(Self { inner: writer, index, obkv_buffer: Vec::new(), value_buffer: Vec::new(), values: BTreeMap::new(), count: 0, - }; - - Ok(this) + }) } /// Returns the number of documents that have been written to the builder. @@ -117,27 +113,31 @@ impl DocumentBatchBuilder { for (i, record) in records.into_records().enumerate() { let record = record?; - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + this.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), - })?, + AllowedType::Number => { + value.parse::().map(Value::from).map_err(|error| { + Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + } + })? 
+ } AllowedType::String => Value::String(value.to_string()), }; + this.value_buffer.clear(); serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; writer.insert(*fid, &this.value_buffer)?; - this.value_buffer.clear(); } this.inner.write_u32::(this.obkv_buffer.len() as u32)?; this.inner.write_all(&this.obkv_buffer)?; - this.obkv_buffer.clear(); this.count += 1; } @@ -156,7 +156,8 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. + "number" => (field_name.to_string(), AllowedType::Number), + // if the pattern isn't reconized, we keep the whole field. _otherwise => (header.to_string(), AllowedType::String), }, None => (header.to_string(), AllowedType::String), @@ -169,9 +170,8 @@ mod test { use serde_json::{json, Map}; - use crate::documents::DocumentBatchReader; - use super::*; + use crate::documents::DocumentBatchReader; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -525,7 +525,9 @@ mod test { "Boston","United States","4628910""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } #[test] @@ -534,7 +536,9 @@ mod test { "Boston","United States","4628910", "too much""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } #[test] @@ -543,6 +547,8 @@ mod test { "Boston","United States""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 6f653d825..14d97ee7d 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,9 +7,8 @@ mod builder; mod reader; mod serde; -use std::num::ParseFloatError; -use std::io; use std::fmt::{self, Debug}; +use std::io; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; @@ -24,7 +23,7 @@ pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { /// Insert the field in the map, or return it's field id if it doesn't already exists. 
- pub fn insert(&mut self, field: &str) -> FieldId { + pub fn insert(&mut self, field: &str) -> FieldId { match self.0.get_by_right(field) { Some(field_id) => *field_id, None => { @@ -43,7 +42,7 @@ impl DocumentsBatchIndex { self.0.len() } - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> bimap::hash::Iter { self.0.iter() } @@ -83,11 +82,7 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat { - error: std::num::ParseFloatError, - line: usize, - value: String, - }, + ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -124,7 +119,9 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), + Error::ParseFloat { error, line, value } => { + write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) + } Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 4dfdca2c7..d57bf1ffb 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,18 +1,13 @@ use std::collections::BTreeMap; -use std::io::Cursor; -use std::io::Write; use std::fmt; +use std::io::{Cursor, Write}; use byteorder::WriteBytesExt; +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; use serde::Deserialize; -use serde::de::DeserializeSeed; -use serde::de::MapAccess; -use serde::de::SeqAccess; -use serde::de::Visitor; use serde_json::Value; -use super::Error; -use super::{ByteCounter, DocumentsBatchIndex}; +use super::{ByteCounter, DocumentsBatchIndex, Error}; use crate::FieldId; macro_rules! tri { @@ -31,8 +26,9 @@ impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_str(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(self) } } @@ -43,7 +39,7 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { where E: serde::de::Error, { - Ok(self.0.insert(v)) + Ok(self.0.insert(v)) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -58,8 +54,9 @@ impl<'de> DeserializeSeed<'de> for ValueDeserializer { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - serde_json::Value::deserialize(deserializer) + D: serde::Deserializer<'de>, + { + serde_json::Value::deserialize(deserializer) } } @@ -80,7 +77,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: SeqAccess<'de>, { - while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } Ok(Ok(())) } @@ -89,7 +88,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { + while let Some((key, value)) = + map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? 
+ { self.values.insert(key, value); } @@ -119,13 +120,15 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { } impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where W: Write, +where + W: Write, { type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_map(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 17c778060..440546b10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -906,8 +906,9 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); + let big_object = serde_json::to_string(&big_object).unwrap(); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index eac60effb..08aa72d35 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -544,7 +544,8 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; + use crate::documents::DocumentsBatchIndex; + use crate::update::index_documents::transform::find_primary_key; #[test] fn primary_key_infered_on_first_field() { From 9b8ab40d80c0fdf6cfeb95270ebd7bdbb4cf386a Mon Sep 17 00:00:00 2001 From: marin postma Date: Tue, 26 Oct 2021 11:35:49 +0200 Subject: [PATCH 12/22] remove search folder --- search/Cargo.toml | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 search/Cargo.toml diff --git a/search/Cargo.toml b/search/Cargo.toml deleted file mode 100644 index d9441984e..000000000 --- a/search/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "search" -version = "0.19.0" -authors = ["Clément Renault "] -edition = "2018" - -[dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -log = "0.4.14" -milli = { path = "../milli" } -serde_json = "1.0.62" -stderrlog = "0.5.1" -structopt = { version = "0.3.21", default-features = false } - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" From 3599df77f05f4e89a28c0160411e95a840e0b227 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 26 Oct 2021 17:49:35 +0200 Subject: [PATCH 13/22] Change some error messages --- milli/src/asc_desc.rs | 12 +- milli/src/criterion.rs | 14 ++- milli/src/error.rs | 112 ++++++++++++------ milli/src/search/facet/filter_condition.rs | 104 ++++++---------- milli/src/update/index_documents/transform.rs | 6 +- milli/src/update/settings.rs | 8 +- 6 files changed, 138 insertions(+), 118 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 00f65a459..c0a277c0c 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -28,12 +28,12 @@ impl fmt::Display for AscDescError { write!(f, "Longitude must be contained between -180 and 180 degrees.",) } Self::InvalidSyntax { name } => { - write!(f, "invalid asc/desc syntax for {}.", name) + write!(f, "invalid 
asc/desc syntax for `{}`.", name) } Self::ReservedKeyword { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a asc/desc rule.", + "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.", name ) } @@ -192,18 +192,18 @@ impl fmt::Display for SortError { Self::BadGeoPointUsage { name } => { write!( f, - "invalid syntax for the `_geoPoint` parameter: `{}`. \ + "Invalid syntax for the `_geoPoint` parameter: `{}`. \ Usage: `_geoPoint(latitude, longitude):asc`.", name ) } Self::InvalidName { name } => { - write!(f, "invalid syntax for the sort parameter `{}`.", name) + write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) } Self::ReservedName { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a sort expression.", + "`{}` is a reserved keyword and thus can't be used as a sort expression.", name ) } @@ -211,7 +211,7 @@ impl fmt::Display for SortError { write!( f, "`{}` is a reserved keyword and thus can't be used as a sort expression. \ - Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.", + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", name, ) } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index aff7fcf68..0586fcc0f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -19,21 +19,25 @@ impl fmt::Display for CriterionError { match self { Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name), Self::ReservedName { name } => { - write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a ranking rule", + name + ) } Self::ReservedNameForSort { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for sorting at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{}` can only be used for sorting at search time", name, name ) } Self::ReservedNameForFilter { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for filtering at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. 
\ +`{}` can only be used for filtering at search time", name, name ) } diff --git a/milli/src/error.rs b/milli/src/error.rs index 1f1cc5264..a4125d117 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,23 +59,28 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, - InvalidFilter(pest::error::Error), - InvalidFilterAttribute(pest::error::Error), + InvalidFilter(FilterError), InvalidGeoField { document_id: Value, object: Value }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, - MissingDocumentId { document: Object }, + MissingDocumentId { primary_key: String, document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, - PrimaryKeyCannotBeChanged, - PrimaryKeyCannotBeReset, + PrimaryKeyCannotBeChanged(String), SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, } +#[derive(Debug)] +pub enum FilterError { + InvalidAttribute { field: String, valid_fields: HashSet }, + ReservedKeyword { field: String, context: Option }, + Syntax(pest::error::Error), +} + impl From for Error { fn from(error: io::Error) -> Error { // TODO must be improved and more precise @@ -160,6 +165,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: FilterError) -> Error { + Error::UserError(UserError::InvalidFilter(error)) + } +} + impl From for Error { fn from(error: SerializationError) -> Error { Error::InternalError(InternalError::Serialization(error)) @@ -169,7 +180,7 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InternalError(error) => write!(f, "internal: {}", error), + Self::InternalError(error) => write!(f, "internal: {}.", error), Self::IoError(error) => error.fmt(f), Self::UserError(error) => error.fmt(f), } @@ -182,15 +193,15 @@ impl fmt::Display for InternalError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::DatabaseMissingEntry { db_name, key } => { - write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { - f.write_str("invalid compression type have been specified to grenad") + f.write_str("Invalid compression type have been specified to grenad.") } Self::IndexingMergingKeys { process } => { - write!(f, "invalid merge while processing {}", process) + write!(f, "Invalid merge while processing {}.", process) } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), @@ -208,67 +219,100 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), Self::CriterionError(error) => write!(f, "{}", error), - Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "); 
write!( f, - "invalid facet distribution, the fields {} are not set as filterable", + "Invalid facet distribution, the fields `{}` are not set as filterable.", name_list ) } Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, - "the document with the id: {} contains an invalid _geo field: {}", + "The document with the id: `{}` contains an invalid _geo field: `{}`.", document_id, object ), Self::InvalidDocumentId { document_id } => { - let json = serde_json::to_string(document_id).unwrap(); + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; write!( f, - "document identifier is invalid {}, \ -a document id can be of type integer or string \ -only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", - json + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", + document_id ) } - Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); write!( f, - "Attribute {} is not sortable, available sortable attributes are: {}", + "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", field, valid_names ) } Self::SortRankingRuleMissing => f.write_str( - "You must specify where \"sort\" is listed in the \ -rankingRules setting to use the sort parameter at search time", + "The sort ranking rule must be specified in the \ +ranking rules settings to use the sort parameter at search time.", ), - Self::MissingDocumentId { document } => { + Self::MissingDocumentId { primary_key, document } => { let json = serde_json::to_string(document).unwrap(); - write!(f, "document doesn't have an identifier {}", json) + write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("missing primary key"), - Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), + Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."), // TODO where can we find it instead of writing the text ourselves? 
- Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), - Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), - Self::PrimaryKeyCannotBeChanged => { - f.write_str("primary key cannot be changed if the database contains documents") - } - Self::PrimaryKeyCannotBeReset => { - f.write_str("primary key cannot be reset if the database contains documents") + Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."), + Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."), + Self::PrimaryKeyCannotBeChanged(primary_key) => { + write!(f, "Index already has a primary key: `{}`.", primary_key) } Self::SerdeJson(error) => error.fmt(f), Self::SortError(error) => write!(f, "{}", error), Self::UnknownInternalDocumentId { document_id } => { - write!(f, "an unknown internal document id have been used ({})", document_id) + write!(f, "An unknown internal document id have been used: `{}`.", document_id) + } + } + } +} + +impl fmt::Display for FilterError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidAttribute { field, valid_fields } => write!( + f, + "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", + field, + valid_fields + .clone() + .into_iter() + .reduce(|left, right| left + "`, `" + &right) + .unwrap_or_default() + ), + Self::ReservedKeyword { field, context: Some(context) } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression. {}", + field, context + ) + } + Self::ReservedKeyword { field, context: None } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + field + ) + } + Self::Syntax(syntax_helper) => { + write!(f, "Invalid syntax for the filter parameter: `{}`.", syntax_helper) } } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f1055b2f8..3378054d4 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -6,7 +6,6 @@ use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; -use itertools::Itertools; use log::debug; use pest::error::{Error as PestError, ErrorVariant}; use pest::iterators::{Pair, Pairs}; @@ -17,7 +16,7 @@ use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::UserError; +use crate::error::FilterError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; @@ -117,8 +116,7 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let lexed = - FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(FilterError::Syntax)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -169,15 +167,11 @@ impl FilterCondition { item: Pair, ) -> Result { if !filterable_fields.contains("_geo") { - return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `_geo` is not filterable, available filterable attributes are: {}", - filterable_fields.iter().join(", "), - ), - }, - item.as_span(), - )))?; + return Err(FilterError::InvalidAttribute { + field: "_geo".to_string(), + valid_fields: 
filterable_fields.clone(), + } + .into()); } let mut items = item.into_inner(); let fid = match fields_ids_map.id("_geo") { @@ -194,27 +188,27 @@ impl FilterCondition { .map(|param| (param.clone(), param.as_span())) .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) .collect::, _>>() - .map_err(UserError::InvalidFilter)?; + .map_err(FilterError::Syntax)?; if parameters.len() != 3 { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), }, // we want to point to the last parameters and if there was no parameters we // point to the parenthesis parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), - )))?; + )).into()); } let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); if !(-90.0..=90.0).contains(&lat.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Latitude must be contained between -90 and 90 degrees."), }, lat.1.clone(), )))?; } else if !(-180.0..=180.0).contains(&lng.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Longitude must be contained between -180 and 180 degrees."), }, @@ -230,9 +224,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -240,8 +232,8 @@ impl FilterCondition { let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult.map_err(UserError::InvalidFilter)?; - let rvalue = rresult.map_err(UserError::InvalidFilter)?; + let lvalue = lresult.map_err(FilterError::Syntax)?; + let rvalue = rresult.map_err(FilterError::Syntax)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -252,9 +244,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -272,16 +262,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThan(value))) } @@ -292,16 +280,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? 
{ Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThanOrEqual(value))) } @@ -312,16 +298,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThan(value))) } @@ -332,16 +316,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThanOrEqual(value))) } @@ -598,43 +580,27 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> StdResult, PestError> { +) -> StdResult, FilterError> { // lexing ensures that we at least have a key let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { - let message = match key.as_str() { + return match key.as_str() { key if key.starts_with("_geoPoint") => { - format!( - "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - ) + Err(FilterError::ReservedKeyword { field: "_geoPoint".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key @ "_geo" => { - format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. 
\ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - key - ) + "_geo" => { + Err(FilterError::ReservedKeyword { field: "_geo".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key => format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression.", - key - ), + key => + Err(FilterError::ReservedKeyword { field: key.to_string(), context: None }), }; - return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); } if !filterable_fields.contains(key.as_str()) { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}.", - key.as_str(), - filterable_fields.iter().join(", "), - ), - }, - key.as_span(), - )); + return Err(FilterError::InvalidAttribute { + field: key.as_str().to_string(), + valid_fields: filterable_fields.clone(), + }); } Ok(fields_ids_map.id(key.as_str())) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08aa72d35..855fb8db9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -187,7 +187,11 @@ impl Transform<'_, '_> { } } - return Err(UserError::MissingDocumentId { document: json }.into()); + return Err(UserError::MissingDocumentId { + primary_key: primary_key_name, + document: json, + } + .into()); } let uuid = diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dee63c726..94875a079 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_primary_key(self.wtxn, primary_key)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeChanged.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::Reset => { @@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.delete_primary_key(self.wtxn)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeReset.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::NotSet => Ok(()), @@ -1105,7 +1107,7 @@ mod tests { builder.reset_primary_key(); let err = builder.execute(|_, _| ()).unwrap_err(); - assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); wtxn.abort().unwrap(); // But if we clear the database... 
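A minimal sketch of what the `FilterError` introduced in this patch looks like to callers once it renders through its new `Display` implementation. The import path and the `main` wrapper are assumptions for illustration only; the variant shapes and the message format come from the diff above:

    use std::collections::HashSet;

    // Assumed import path; in the tree the type is defined in milli/src/error.rs.
    use milli::error::FilterError;

    fn main() {
        let error = FilterError::InvalidAttribute {
            field: "title".to_string(),
            valid_fields: HashSet::from(["price".to_string(), "_geo".to_string()]),
        };

        // Prints: Attribute `title` is not filterable. Available filterable
        // attributes are: `price`, `_geo`.
        // `HashSet` iteration order is unspecified, so the listed fields can
        // come out in either order; a later patch in this series switches
        // `valid_fields` to a `BTreeSet`, which makes the message deterministic.
        println!("{}", error);
    }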
From 2be755ce75d8b1e64c11bf1801a1f3474beb3362 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 27 Oct 2021 19:50:41 +0200 Subject: [PATCH 14/22] Lower error check, already check in meilisearch --- milli/src/asc_desc.rs | 46 ---------------------- milli/src/search/facet/filter_condition.rs | 43 ++------------------ 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index c0a277c0c..8d4973c2f 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -318,50 +318,4 @@ mod tests { ); } } - - #[test] - fn sort_error_message() { - let errors = [ - ( - AscDescError::InvalidSyntax { name: S("truc:machin") }, - S("invalid syntax for the sort parameter `truc:machin`."), - ), - ( - AscDescError::InvalidSyntax { name: S("hello:world") }, - S("invalid syntax for the sort parameter `hello:world`."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geo") }, - S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geoDistance") }, - S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.") - ), - ( - AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, - S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::InvalidLatitude, - S("Latitude must be contained between -90 and 90 degrees."), - ), - ( - AscDescError::InvalidLongitude, - S("Longitude must be contained between -180 and 180 degrees."), - ), - ]; - - for (asc_desc_error, expected_message) in errors { - let sort_error = SortError::from(asc_desc_error); - assert_eq!( - sort_error.to_string(), - expected_message, - "was expecting {} for the error {:?} but instead got {}", - expected_message, - sort_error, - sort_error.to_string() - ); - } - } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 3378054d4..f8d40aefb 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -755,39 +755,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let rtxn = index.read_txn().unwrap(); - let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); - assert!(error - .to_string() - .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, "_geo = 12").is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).is_err()); - let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).is_err()); } #[test] @@ -804,15 +778,6 @@ mod tests { builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - let rtxn = index.read_txn().unwrap(); - // _geo is not filterable - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); - let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); From 183d3dada7d50c2bc8a129805e8dfae4bb3255dd Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 28 Oct 2021 10:33:04 +0200 Subject: [PATCH 15/22] return document count from builder --- milli/src/documents/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 6ba890b79..f95fa9190 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -62,7 +62,7 @@ impl DocumentBatchBuilder { /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. - pub fn finish(self) -> Result<(), Error> { + pub fn finish(self) -> Result { let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -74,7 +74,7 @@ impl DocumentBatchBuilder { writer.flush()?; - Ok(()) + Ok(count) } /// Extends the builder with json documents from a reader. 
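With this change `finish` reports the number of documents written instead of `()`. A short usage sketch, mirroring the builder doc-test from earlier in the series; the function name and the `?`-based error plumbing are illustrative:

    use std::io::Cursor;

    use milli::documents::{DocumentBatchBuilder, Error};

    fn write_batch() -> Result<usize, Error> {
        let json = r#"{"id": 1, "name": "foo"}"#;
        let mut writer = Cursor::new(Vec::new());

        let mut builder = DocumentBatchBuilder::new(&mut writer)?;
        builder.extend_from_json(&mut json.as_bytes())?;

        // `finish` now returns the document count, so callers can report it
        // without keeping a separate tally: `len()` was previously the only
        // way to get it, and it is unreachable once the builder is consumed.
        let count = builder.finish()?;
        assert_eq!(count, 1);
        Ok(count)
    }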
From ed6db196810f78632758fc386f8a7f5f6cd6f357 Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 28 Oct 2021 11:18:32 +0200
Subject: [PATCH 16/22] Fix PR comments

---
 milli/src/error.rs                         |  8 ++++----
 milli/src/search/facet/filter_condition.rs | 22 +++++++++++++++-------
 milli/src/search/mod.rs                    |  4 ++--
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index a4125d117..be8458ce6 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::BTreeSet;
 use std::convert::Infallible;
 use std::error::Error as StdError;
 use std::{fmt, io, str};
@@ -58,10 +58,10 @@ pub enum UserError {
     CriterionError(CriterionError),
     DocumentLimitReached,
     InvalidDocumentId { document_id: Value },
-    InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
+    InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
     InvalidFilter(FilterError),
     InvalidGeoField { document_id: Value, object: Value },
-    InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
+    InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
     SortRankingRuleMissing,
     InvalidStoreFile,
     MaxDatabaseSizeReached,
@@ -76,7 +76,7 @@ pub enum UserError {
 
 #[derive(Debug)]
 pub enum FilterError {
-    InvalidAttribute { field: String, valid_fields: HashSet<String> },
+    InvalidAttribute { field: String, valid_fields: BTreeSet<String> },
     ReservedKeyword { field: String, context: Option<String> },
     Syntax(pest::error::Error<Rule>),
 }
diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index f8d40aefb..cd7bcdc4f 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -169,7 +169,7 @@ impl FilterCondition {
             if !filterable_fields.contains("_geo") {
                 return Err(FilterError::InvalidAttribute {
                     field: "_geo".to_string(),
-                    valid_fields: filterable_fields.clone(),
+                    valid_fields: filterable_fields.into_iter().cloned().collect(),
                 }
                 .into());
             }
@@ -192,7 +192,7 @@ impl FilterCondition {
             if parameters.len() != 3 {
                 return Err(FilterError::Syntax(PestError::new_from_span(
                     ErrorVariant::CustomError {
-                        message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"),
+                        message: format!("The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"),
                     },
                     // we want to point to the last parameters and if there was no parameters we
                     // point to the parenthesis
@@ -599,7 +599,7 @@ fn field_id(
     if !filterable_fields.contains(key.as_str()) {
         return Err(FilterError::InvalidAttribute {
             field: key.as_str().to_string(),
-            valid_fields: filterable_fields.clone(),
+            valid_fields: filterable_fields.into_iter().cloned().collect(),
         });
     }
 
@@ -829,26 +829,34 @@ mod tests {
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius");
         assert!(result.is_err());
        let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius don't have any parameters
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius don't have enough parameters
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius have too many parameters
         let result =
             FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius have a bad latitude
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)");
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index bec059d46..aa2544091 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -151,13 +151,13 @@ impl<'a> Search<'a> {
             Member::Field(ref field) if !sortable_fields.contains(field) => {
                 return Err(UserError::InvalidSortableAttribute {
                     field: field.to_string(),
-                    valid_fields: sortable_fields,
+                    valid_fields: sortable_fields.into_iter().collect(),
                 })?
             }
             Member::Geo(_) if !sortable_fields.contains("_geo") => {
                 return Err(UserError::InvalidSortableAttribute {
                     field: "_geo".to_string(),
-                    valid_fields: sortable_fields,
+                    valid_fields: sortable_fields.into_iter().collect(),
                 })?
             }
             _ => (),

From 9f1e0d2a49447f106277b8a07e0bba65370b47c8 Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 28 Oct 2021 14:47:17 +0200
Subject: [PATCH 17/22] Refine asc/desc error messages

---
 milli/src/asc_desc.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs
index 8d4973c2f..f07e1ded8 100644
--- a/milli/src/asc_desc.rs
+++ b/milli/src/asc_desc.rs
@@ -28,7 +28,7 @@ impl fmt::Display for AscDescError {
                 write!(f, "Longitude must be contained between -180 and 180 degrees.",)
             }
             Self::InvalidSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for `{}`.", name)
+                write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedKeyword { name } => {
                 write!(
                     f,
@@ -192,8 +192,8 @@ impl fmt::Display for SortError {
             Self::BadGeoPointUsage { name } => {
                 write!(
                     f,
-                    "Invalid syntax for the `_geoPoint` parameter: `{}`. \
-                    Usage: `_geoPoint(latitude, longitude):asc`.",
+                    "Invalid syntax for the geo parameter: expected expression formated like \
+                    `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
                     name
                 )
             }

From 056ff13c4d981a37fce66b6f1c7eee6a375920d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Thu, 28 Oct 2021 14:52:57 +0200
Subject: [PATCH 18/22] Update version for the next release (v0.20.0)

---
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 341b8eb7c..2e71dfb21 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 310388e01..824771ce7 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 726fa9c5f..461c22de0 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 209c8b1f7..375cbf0dc 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Kerollmops "]
 edition = "2018"

From 0c0038488c4881c4b5dd8d1c8e8ba59ea7d5737c Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 3 Nov 2021 11:24:06 +0100
Subject: [PATCH 19/22] Change last error messages

---
 milli/src/error.rs | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index be8458ce6..59744a32e 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -232,11 +232,21 @@ impl fmt::Display for UserError {
                 )
             }
             Self::InvalidFilter(error) => error.fmt(f),
-            Self::InvalidGeoField { document_id, object } => write!(
-                f,
-                "The document with the id: `{}` contains an invalid _geo field: `{}`.",
-                document_id, object
-            ),
+            Self::InvalidGeoField { document_id, object } => {
+                let document_id = match document_id {
+                    Value::String(id) => id.clone(),
+                    _ => document_id.to_string(),
+                };
+                let object = match object {
+                    Value::String(id) => id.clone(),
+                    _ => object.to_string(),
+                };
+                write!(
+                    f,
+                    "The document with the id: `{}` contains an invalid _geo field: `{}`.",
+                    document_id, object
+                )
+            },
             Self::InvalidDocumentId { document_id } => {
                 let document_id = match document_id {
                     Value::String(id) => id.clone(),
@@ -268,10 +278,9 @@ ranking rules settings to use the sort parameter at search time.",
                 write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
             }
             Self::MissingPrimaryKey => f.write_str("Missing primary key."),
-            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."),
-            // TODO where can we find it instead of writing the text ourselves?
-            Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."),
-            Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."),
+            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
+            Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
+            Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
             Self::PrimaryKeyCannotBeChanged(primary_key) => {
                 write!(f, "Index already has a primary key: `{}`.", primary_key)
             }

From 702589104d45e5c1f51eac2ecaa269727ebf34a8 Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 3 Nov 2021 14:20:01 +0100
Subject: [PATCH 20/22] Update version for the next release (v0.20.1)

---
 cli/Cargo.toml     | 2 +-
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 24fb214b9..44f7d110b 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.1.0"
+version = "0.20.1"
 edition = "2018"
 description = "A CLI to interact with a milli index"

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 2e71dfb21..da26a8baa 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 824771ce7..43980009a 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 461c22de0..c96ff094c 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 375cbf0dc..fc45b5355 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Kerollmops "]
 edition = "2018"

From 7b3bac46a0cc6e58e1ee5456f6aa1550cdf2852a Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 4 Nov 2021 13:19:32 +0100
Subject: [PATCH 21/22] Change Attribute and Ranking rules errors

---
 milli/src/criterion.rs | 2 +-
 milli/src/error.rs     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 0586fcc0f..aca2f95b5 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -17,7 +17,7 @@ pub enum CriterionError {
 impl fmt::Display for CriterionError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. 
Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name), Self::ReservedName { name } => { write!( f, diff --git a/milli/src/error.rs b/milli/src/error.rs index 59744a32e..9e8ad515d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -219,7 +219,7 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), + Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { @@ -277,7 +277,7 @@ ranking rules settings to use the sort parameter at search time.", let json = serde_json::to_string(document).unwrap(); write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."), Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."), Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), From 743ed9f57f04decf5fb6b0d43b0a5a609cbebf3c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 4 Nov 2021 14:04:21 +0100 Subject: [PATCH 22/22] Bump milli version --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 44f7d110b..eb03842ca 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.20.1" +version = "0.20.2" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index da26a8baa..5b33d2a4f 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 43980009a..04e1c708a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c96ff094c..645bc4cdd 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fc45b5355..5aa04e569 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.20.1" +version = "0.20.2" authors = ["Kerollmops "] edition = "2018"
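To close the series, a short sketch of the message produced by the reworked `CriterionError::InvalidName` variant from patch 21. It assumes the error type is re-exported at the crate root (e.g. `milli::CriterionError`), and `release_date` is a made-up rule name used only for illustration:

    fn main() {
        // An unknown ranking rule name now yields the descriptive message added in patch 21.
        let error = milli::CriterionError::InvalidName { name: "release_date".to_string() };
        assert_eq!(
            error.to_string(),
            "`release_date` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."
        );
    }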