From f8fe9316c08b924086a700a83616b8fa619707f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Thu, 21 Oct 2021 11:56:14 +0200 Subject: [PATCH 01/22] Update version for the next release (v0.18.1) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index 2f3c26a7b..b3a15f100 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 83d5f67c4..a7af6fb9b 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index b0220993f..2701c36d7 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..01309797d 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.0" +version = "0.18.1" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index b55c4b8d6..dd74ad7b6 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.0" +version = "0.18.1" authors = ["Clément Renault "] edition = "2018" From 8d70b01714acdca6e27dccbf08e6c42829973ccd Mon Sep 17 00:00:00 2001 From: marin postma Date: Wed, 20 Oct 2021 21:26:52 +0200 Subject: [PATCH 02/22] optimize document deserialization --- milli/src/documents/builder.rs | 142 ++++++- milli/src/documents/mod.rs | 20 +- milli/src/documents/serde.rs | 532 +++++------------------- milli/src/search/distinct/mod.rs | 3 +- milli/src/update/index_documents/mod.rs | 3 +- milli/tests/search/mod.rs | 7 +- milli/tests/search/query_criteria.rs | 3 +- 7 files changed, 241 insertions(+), 469 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index ba1319eff..98213edd7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,13 @@ +use std::collections::BTreeMap; use std::io; use byteorder::{BigEndian, WriteBytesExt}; -use serde::ser::Serialize; +use serde::Deserializer; +use serde_json::Value; -use super::serde::DocumentSerializer; +use crate::FieldId; + +use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -24,7 +28,12 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { - serializer: DocumentSerializer, + inner: ByteCounter, + index: DocumentsBatchIndex, + obkv_buffer: Vec, + value_buffer: Vec, + values: BTreeMap, + count: usize, } impl DocumentBatchBuilder { @@ -34,27 +43,33 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let serializer = - DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, 
allow_seq: true }; + let this = Self { + inner: writer, + index, + obkv_buffer: Vec::new(), + value_buffer: Vec::new(), + values: BTreeMap::new(), + count: 0, + }; - Ok(Self { serializer }) + Ok(this) } /// Returns the number of documents that have been written to the builder. pub fn len(&self) -> usize { - self.serializer.count + self.count } /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let DocumentSerializer { - writer: ByteCounter { mut writer, count: offset }, + let Self { + inner: ByteCounter { mut writer, count: offset }, index, count, .. - } = self.serializer; + } = self; let meta = DocumentsMetadata { count, index }; @@ -68,13 +83,106 @@ impl DocumentBatchBuilder { Ok(()) } - /// Adds documents to the builder. - /// - /// The internal index is updated with the fields found - /// in the documents. Document must either be a map or a sequences of map, anything else will - /// fail. - pub fn add_documents(&mut self, document: T) -> Result<(), Error> { - document.serialize(&mut self.serializer)?; + + /// Extends the builder with json documents from a reader. + pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + + let mut visitor = DocumentVisitor { + inner: &mut self.inner, + index: &mut self.index, + obkv_buffer: &mut self.obkv_buffer, + value_buffer: &mut self.value_buffer, + values: &mut self.values, + count: &mut self.count, + }; + + de.deserialize_any(&mut visitor).unwrap(); + Ok(()) } } + +#[cfg(test)] +mod test { + use std::io::Cursor; + + use crate::documents::DocumentBatchReader; + + use super::*; + + #[test] + fn add_single_documents_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!({ + "id": 1, + "field": "hello!", + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + let json = serde_json::json!({ + "blabla": false, + "field": "hello!", + "id": 1, + }); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn add_documents_seq_json() { + let mut cursor = Cursor::new(Vec::new()); + let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let json = serde_json::json!([{ + "id": 1, + "field": "hello!", + },{ + "blabla": false, + "field": "hello!", + "id": 1, + } + ]); + + builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + + assert_eq!(builder.len(), 2); + + builder.finish().unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 2); + + 
let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 3); + assert_eq!(document.iter().count(), 3); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index f79c210fe..ce0539c24 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -92,7 +92,8 @@ macro_rules! documents { let documents = serde_json::json!($data); let mut writer = std::io::Cursor::new(Vec::new()); let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - builder.add_documents(documents).unwrap(); + let documents = serde_json::to_vec(&documents).unwrap(); + builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); builder.finish().unwrap(); writer.set_position(0); @@ -124,7 +125,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(json).unwrap(); + todo!(); + //builder.add_documents(json).unwrap(); builder.finish().unwrap(); @@ -153,8 +155,9 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + todo!(); + //builder.add_documents(doc1).unwrap(); + //builder.add_documents(doc2).unwrap(); builder.finish().unwrap(); @@ -182,7 +185,8 @@ mod test { let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(docs).unwrap(); + todo!(); + //builder.add_documents(docs).unwrap(); builder.finish().unwrap(); @@ -210,11 +214,13 @@ mod test { { "tata": "hello" }, ]]); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); let docs = json!("hello"); - assert!(builder.add_documents(docs).is_err()); + todo!(); + //assert!(builder.add_documents(docs).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 036ec246a..0d02fff6c 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,474 +1,128 @@ use std::collections::BTreeMap; -use std::convert::TryInto; use std::io::Cursor; -use std::{fmt, io}; +use std::io::Write; +use std::fmt; -use byteorder::{BigEndian, WriteBytesExt}; -use obkv::KvWriter; -use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer}; +use byteorder::WriteBytesExt; +use serde::Deserialize; +use serde::de::DeserializeSeed; +use serde::de::MapAccess; +use serde::de::SeqAccess; +use serde::de::Visitor; use serde_json::Value; -use super::{ByteCounter, DocumentsBatchIndex, Error}; +use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; -pub struct DocumentSerializer { - pub writer: ByteCounter, - pub buffer: Vec, - pub index: DocumentsBatchIndex, - pub count: usize, - pub allow_seq: bool, -} +struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); -impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer { - type Ok = (); +impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { + type Value = FieldId; - type Error = Error; - - type SerializeSeq = SeqSerializer<'a, W>; - type SerializeTuple = Impossible<(), Self::Error>; - type SerializeTupleStruct = Impossible<(), Self::Error>; - type SerializeTupleVariant = Impossible<(), Self::Error>; - type SerializeMap = MapSerializer<'a, &'a mut ByteCounter>; - type SerializeStruct = Impossible<(), Self::Error>; - type SerializeStructVariant = Impossible<(), Self::Error>; - fn serialize_map(self, _len: Option) -> Result { - 
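        // Every document starts a fresh obkv map writer over the shared, cleared buffer,
        // and the running document count is bumped as soon as the map is opened.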
self.buffer.clear(); - let cursor = io::Cursor::new(&mut self.buffer); - self.count += 1; - let map_serializer = MapSerializer { - map: KvWriter::new(cursor), - index: &mut self.index, - writer: &mut self.writer, - mapped_documents: BTreeMap::new(), - }; - - Ok(map_serializer) - } - - fn serialize_seq(self, _len: Option) -> Result { - if self.allow_seq { - // Only allow sequence of documents of depth 1. - self.allow_seq = false; - Ok(SeqSerializer { serializer: self }) - } else { - Err(Error::InvalidDocumentFormat) - } - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_str(self, _v: &str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) + D: serde::Deserializer<'de> { + deserializer.deserialize_str(self) } } -pub struct 
SeqSerializer<'a, W> {
    serializer: &'a mut DocumentSerializer<W>,
}

impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
    type Ok = ();
    type Error = Error;

    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
    where
        T: Serialize,
    {
        value.serialize(&mut *self.serializer)?;
        Ok(())
    }

    fn end(self) -> Result<Self::Ok, Self::Error> {
        Ok(())
    }
}

pub struct MapSerializer<'a, W> {
    map: KvWriter<Cursor<&'a mut Vec<u8>>, FieldId>,
    index: &'a mut DocumentsBatchIndex,
    writer: W,
    mapped_documents: BTreeMap<FieldId, Value>,
}

/// This implementation of SerializeMap uses serialize_entry instead of serialize_key and
/// serialize_value, therefore these two methods remain unimplemented.
-impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> { - type Ok = (); - type Error = Error; - - fn serialize_key(&mut self, _key: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn serialize_value(&mut self, _value: &T) -> Result<(), Self::Error> { - unreachable!() - } - - fn end(mut self) -> Result { - let mut buf = Vec::new(); - for (key, value) in self.mapped_documents { - buf.clear(); - let mut cursor = Cursor::new(&mut buf); - serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?; - self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?; + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + self.values.insert(key, value); } - let data = self.map.into_inner().map_err(Error::Io)?.into_inner(); - let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?; + self.obkv_buffer.clear(); + let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); + for (key, value) in self.values.iter() { + self.value_buffer.clear(); + // This is guaranteed to work + serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); + obkv.insert(*key, &self.value_buffer).unwrap(); + } - self.writer.write_u32::(data_len).map_err(Error::Io)?; - self.writer.write_all(&data).map_err(Error::Io)?; + let reader = obkv.into_inner().unwrap().into_inner(); + + self.inner.write_u32::(reader.len() as u32).unwrap(); + self.inner.write_all(reader).unwrap(); + + *self.count += 1; Ok(()) } - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: Serialize, - V: Serialize, - { - let field_serializer = FieldSerializer { index: &mut self.index }; - let field_id: FieldId = key.serialize(field_serializer)?; - - let value = serde_json::to_value(value).map_err(Error::JsonError)?; - - self.mapped_documents.insert(field_id, value); - - Ok(()) + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") } } -struct FieldSerializer<'a> { - index: &'a mut DocumentsBatchIndex, -} +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where W: Write, +{ + type Value = (); -impl<'a> serde::Serializer for FieldSerializer<'a> { - type Ok = FieldId; - - type Error = Error; - - type SerializeSeq = Impossible; - type SerializeTuple = Impossible; - type SerializeTupleStruct = Impossible; - type SerializeTupleVariant = Impossible; - type SerializeMap = Impossible; - type SerializeStruct = Impossible; - type SerializeStructVariant = Impossible; - - fn serialize_str(self, ws: &str) -> Result { - let field_id = match self.index.get_by_right(ws) { - Some(field_id) => *field_id, - None => { - let field_id = self.index.len() as FieldId; - self.index.insert(field_id, ws.to_string()); - field_id - } - }; - - Ok(field_id) - } - - fn serialize_bool(self, _v: bool) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i8(self, _v: i8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i16(self, _v: i16) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i32(self, _v: i32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_i64(self, _v: i64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u8(self, _v: u8) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u16(self, _v: u16) -> Result { - 
Err(Error::InvalidDocumentFormat) - } - - fn serialize_u32(self, _v: u32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_u64(self, _v: u64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f32(self, _v: f32) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_f64(self, _v: f64) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_char(self, _v: char) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_none(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_some(self, _value: &T) -> Result + fn deserialize(self, deserializer: D) -> Result where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit(self) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } -} - -impl serde::ser::Error for Error { - fn custom(msg: T) -> Self { - Error::Custom(msg.to_string()) + D: serde::Deserializer<'de> { + deserializer.deserialize_map(self) } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index deb51a053..5639c53fa 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,7 +68,8 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - builder.add_documents(doc).unwrap(); + todo!() + //builder.add_documents(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 92bcab0e9..58e21a615 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,7 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + todo!(); + //builder.add_documents(big_object).unwrap(); 
builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index cda0da617..d62b8ec31 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,9 +61,10 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - documents_builder.add_documents(doc.unwrap()).unwrap(); - } + todo!(); + //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + //documents_builder.add_documents(doc.unwrap()).unwrap(); + //} documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index f3b04c4fa..3fb36b1d5 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,7 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - batch_builder.add_documents(json).unwrap(); + todo!(); + //batch_builder.add_documents(json).unwrap(); }); batch_builder.finish().unwrap(); From 0f86d6b28f333b2f75439ef3483b0f8f2f59c824 Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 21 Oct 2021 11:05:16 +0200 Subject: [PATCH 03/22] implement csv serialization --- milli/Cargo.toml | 1 + milli/src/documents/builder.rs | 103 +++++++++++++++++- milli/src/documents/mod.rs | 33 +++++- milli/src/documents/serde.rs | 14 +-- milli/src/update/index_documents/transform.rs | 8 +- 5 files changed, 142 insertions(+), 17 deletions(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..709f8d865 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -47,6 +47,7 @@ itertools = "0.10.0" # logging log = "0.4.14" logging_timer = "1.0.0" +csv = "1.1.6" [dev-dependencies] big_s = "1.0.2" diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 98213edd7..719580b4a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,8 @@ use std::collections::BTreeMap; +use std::collections::HashMap; use std::io; +use std::io::Cursor; +use std::io::Write; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; @@ -38,7 +41,7 @@ pub struct DocumentBatchBuilder { impl DocumentBatchBuilder { pub fn new(writer: W) -> Result { - let index = DocumentsBatchIndex::new(); + let index = DocumentsBatchIndex::default(); let mut writer = ByteCounter::new(writer); // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; @@ -101,6 +104,79 @@ impl DocumentBatchBuilder { Ok(()) } + + /// Extends the builder with json documents from a reader. + /// + /// This method can be only called once and is mutually exclusive with extend from json. This + /// is because the fields in a csv are always guaranteed to come in order, and permits some + /// optimizations. + /// + /// From csv takes care to call finish in the end. 
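    ///
    /// A minimal usage sketch (values are illustrative; the `name:type` header convention is
    /// handled by `parse_csv_header` below):
    ///
    /// ```ignore
    /// let mut writer = std::io::Cursor::new(Vec::new());
    /// let builder = DocumentBatchBuilder::new(&mut writer).unwrap();
    /// builder.from_csv("id:number,title:string\n1,hello".as_bytes()).unwrap();
    /// ```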
+ pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + + // Ensure that this is the first and only addition made with this builder + debug_assert!(self.index.is_empty()); + + let mut records = csv::Reader::from_reader(reader); + + let headers = records + .headers() + .unwrap() + .into_iter() + .map(parse_csv_header) + .map(|(k, t)| (self.index.insert(&k), t)) + .collect::>(); + + let records = records.into_records(); + + dbg!(&headers); + for record in records { + match record { + Ok(record) => { + let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::String => Value::String(value.to_string()), + }; + + serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); + writer.insert(*fid, &self.value_buffer)?; + self.value_buffer.clear(); + } + + self.inner.write_u32::(self.obkv_buffer.len() as u32)?; + self.inner.write_all(&self.obkv_buffer)?; + + self.obkv_buffer.clear(); + self.count += 1; + }, + Err(_) => panic!(), + } + } + + self.finish()?; + + Ok(()) + } +} + +#[derive(Debug)] +enum AllowedType { + String, + Number, +} + +fn parse_csv_header(header: &str) -> (String, AllowedType) { + // if there are several separators we only split on the last one. + match header.rsplit_once(':') { + Some((field_name, field_type)) => match field_type { + "string" => (field_name.to_string(), AllowedType::String), + "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. + _otherwise => (header.to_string(), AllowedType::String), + }, + None => (header.to_string(), AllowedType::String), + } } #[cfg(test)] @@ -185,4 +261,29 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn add_documents_csv() { + let mut cursor = Cursor::new(Vec::new()); + let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + + let csv = "id:number,field:string\n1,hello!\n2,blabla"; + + builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + + cursor.set_position(0); + + let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); + + dbg!(reader.len()); + + let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(index.len(), 2); + assert_eq!(document.iter().count(), 2); + + let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + assert_eq!(document.iter().count(), 2); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index ce0539c24..9f6ebd3de 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -17,7 +17,38 @@ pub use reader::DocumentBatchReader; use crate::FieldId; /// A bidirectional map that links field ids to their name in a document batch. -pub type DocumentsBatchIndex = BiHashMap; +#[derive(Default, Debug, Serialize, Deserialize)] +pub struct DocumentsBatchIndex(pub BiHashMap); + +impl DocumentsBatchIndex { + /// Insert the field in the map, or return it's field id if it doesn't already exists. 
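    ///
    /// A small sketch of the intended behaviour (illustrative):
    ///
    /// ```ignore
    /// let mut index = DocumentsBatchIndex::default();
    /// let a = index.insert("title");
    /// let b = index.insert("title");
    /// assert_eq!(a, b); // a field always resolves to the same id
    /// ```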
+ pub fn insert(&mut self, field: &str) -> FieldId { + match self.0.get_by_right(field) { + Some(field_id) => *field_id, + None => { + let field_id = self.0.len() as FieldId; + self.0.insert(field_id, field.to_string()); + field_id + } + } + } + + pub fn is_empty(&self) -> bool { + self.0.is_empty() + } + + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } + + pub fn get_id(&self, id: FieldId) -> Option<&String> { + self.0.get_by_left(&id) + } +} #[derive(Debug, Serialize, Deserialize)] struct DocumentsMetadata { diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 0d02fff6c..86fb68534 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -31,17 +31,9 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { fn visit_str(self, v: &str) -> Result where - E: serde::de::Error, { - let field_id = match self.0.get_by_right(v) { - Some(field_id) => *field_id, - None => { - let field_id = self.0.len() as FieldId; - self.0.insert(field_id, v.to_string()); - field_id - } - }; - - Ok(field_id) + E: serde::de::Error, + { + Ok(self.0.insert(v)) } fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index c0c88abed..5af1eda72 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -75,7 +75,7 @@ fn create_fields_mapping( .collect() } -fn find_primary_key(index: &bimap::BiHashMap) -> Option<&str> { +fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> { index .iter() .sorted_by_key(|(k, _)| *k) @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_by_left(&key).cloned(); + let key = addition_index.get_id(key).cloned(); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { @@ -544,7 +544,7 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::update::index_documents::transform::find_primary_key; + use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; #[test] fn primary_key_infered_on_first_field() { @@ -557,7 +557,7 @@ mod test { map.insert(4, "fakeId".to_string()); map.insert(0, "realId".to_string()); - assert_eq!(find_primary_key(&map), Some("realId")); + assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); } } } From 2e62925a6e5f6ee864410322758b32c39a8f1a38 Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 14:41:36 +0200 Subject: [PATCH 04/22] fix tests --- milli/src/documents/builder.rs | 51 +++++++++---------------- milli/src/documents/mod.rs | 28 ++++++++------ milli/src/documents/serde.rs | 1 + milli/src/index.rs | 1 + milli/src/search/distinct/mod.rs | 5 ++- milli/src/update/index_documents/mod.rs | 4 +- milli/tests/search/mod.rs | 10 +++-- milli/tests/search/query_criteria.rs | 4 +- 8 files changed, 49 insertions(+), 55 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 719580b4a..8c70910b5 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,5 +1,4 @@ use std::collections::BTreeMap; -use std::collections::HashMap; use std::io; use std::io::Cursor; use std::io::Write; @@ -18,18 +17,6 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// /// 
The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to /// iterate other the documents. -/// -/// ## example: -/// ``` -/// use milli::documents::DocumentBatchBuilder; -/// use serde_json::json; -/// use std::io::Cursor; -/// -/// let mut writer = Cursor::new(Vec::new()); -/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap(); -/// builder.finish().unwrap(); -/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, @@ -100,7 +87,7 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).unwrap(); + de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; Ok(()) } @@ -112,10 +99,11 @@ impl DocumentBatchBuilder { /// optimizations. /// /// From csv takes care to call finish in the end. - pub fn from_csv(mut self, reader: R) -> Result<(), Error> { + pub fn from_csv(reader: R, writer: W) -> Result { + let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder - debug_assert!(self.index.is_empty()); + debug_assert!(this.index.is_empty()); let mut records = csv::Reader::from_reader(reader); @@ -124,40 +112,37 @@ impl DocumentBatchBuilder { .unwrap() .into_iter() .map(parse_csv_header) - .map(|(k, t)| (self.index.insert(&k), t)) - .collect::>(); + .map(|(k, t)| (this.index.insert(&k), t)) + .collect::>(); let records = records.into_records(); - dbg!(&headers); for record in records { match record { Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut self.obkv_buffer)); + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { AllowedType::Number => value.parse::().map(Value::from).unwrap(), AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut self.value_buffer), dbg!(&value)).unwrap(); - writer.insert(*fid, &self.value_buffer)?; - self.value_buffer.clear(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } - self.inner.write_u32::(self.obkv_buffer.len() as u32)?; - self.inner.write_all(&self.obkv_buffer)?; + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; - self.obkv_buffer.clear(); - self.count += 1; + this.obkv_buffer.clear(); + this.count += 1; }, Err(_) => panic!(), } } - self.finish()?; - - Ok(()) + Ok(this) } } @@ -265,18 +250,16 @@ mod test { #[test] fn add_documents_csv() { let mut cursor = Cursor::new(Vec::new()); - let builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let csv = "id:number,field:string\n1,hello!\n2,blabla"; - builder.from_csv(Cursor::new(csv.as_bytes())).unwrap(); + let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + builder.finish().unwrap(); cursor.set_position(0); let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - dbg!(reader.len()); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); assert_eq!(index.len(), 2); assert_eq!(document.iter().count(), 2); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 9f6ebd3de..8a8b87794 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -135,6 +135,8 @@ macro_rules! 
documents { #[cfg(test)] mod test { + use std::io::Cursor; + use serde_json::{json, Value}; use super::*; @@ -151,13 +153,14 @@ mod test { "bool": true }); + let json = serde_json::to_vec(&json).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(json).unwrap(); + builder.extend_from_json(Cursor::new(json)).unwrap(); builder.finish().unwrap(); @@ -181,14 +184,16 @@ mod test { "toto": false, }); + let doc1 = serde_json::to_vec(&doc1).unwrap(); + let doc2 = serde_json::to_vec(&doc2).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(doc1).unwrap(); - //builder.add_documents(doc2).unwrap(); + builder.extend_from_json(Cursor::new(doc1)).unwrap(); + builder.extend_from_json(Cursor::new(doc2)).unwrap(); builder.finish().unwrap(); @@ -211,13 +216,14 @@ mod test { { "tata": "hello" }, ]); + let docs = serde_json::to_vec(&docs).unwrap(); + let mut v = Vec::new(); let mut cursor = io::Cursor::new(&mut v); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(docs).unwrap(); + builder.extend_from_json(Cursor::new(docs)).unwrap(); builder.finish().unwrap(); @@ -245,13 +251,13 @@ mod test { { "tata": "hello" }, ]]); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + let docs = serde_json::to_vec(&docs).unwrap(); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); let docs = json!("hello"); + let docs = serde_json::to_vec(&docs).unwrap(); - todo!(); - //assert!(builder.add_documents(docs).is_err()); + assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); } #[test] diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 86fb68534..2466ed373 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -98,6 +98,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { self.inner.write_all(reader).unwrap(); *self.count += 1; + self.values.clear(); Ok(()) } diff --git a/milli/src/index.rs b/milli/src/index.rs index 6ce693fbe..fe89fe734 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -953,6 +953,7 @@ pub(crate) mod tests { { "id": 1, "name": "kevin", "has_dog": true }, { "id": 2, "name": "bob" } ]); + let mut wtxn = index.write_txn().unwrap(); let builder = IndexDocuments::new(&mut wtxn, &index, 0); builder.execute(content, |_, _| ()).unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 5639c53fa..11f6379e3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -68,8 +68,9 @@ mod test { "txts": sample_txts[..(rng.gen_range(0..3))], "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - todo!() - //builder.add_documents(doc).unwrap(); + + let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); + builder.extend_from_json(doc).unwrap(); } builder.finish().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 58e21a615..17c778060 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -877,8 +877,8 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - todo!(); - //builder.add_documents(big_object).unwrap(); + let big_object = 
Cursor::new(serde_json::to_vec(&big_object).unwrap()); + builder.extend_from_json(big_object).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index d62b8ec31..e8fb3fdfa 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,10 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut cursor = Cursor::new(Vec::new()); let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); let reader = Cursor::new(CONTENT.as_bytes()); - todo!(); - //for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - //documents_builder.add_documents(doc.unwrap()).unwrap(); - //} + + for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { + let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); + documents_builder.extend_from_json(doc).unwrap(); + } + documents_builder.finish().unwrap(); cursor.set_position(0); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 3fb36b1d5..e5dde049c 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -409,8 +409,8 @@ fn criteria_ascdesc() { "age": age, }); - todo!(); - //batch_builder.add_documents(json).unwrap(); + let json = Cursor::new(serde_json::to_vec(&json).unwrap()); + batch_builder.extend_from_json(json).unwrap(); }); batch_builder.finish().unwrap(); From 53c79e85f2a0acd4559c11eb01f27914ca1c5ccb Mon Sep 17 00:00:00 2001 From: marin postma Date: Sun, 24 Oct 2021 15:39:56 +0200 Subject: [PATCH 05/22] document errors --- milli/src/documents/builder.rs | 20 +++++++------------ milli/src/documents/mod.rs | 30 +++++++++++++++++++++++++--- milli/src/documents/serde.rs | 36 ++++++++++++++++++++++------------ 3 files changed, 57 insertions(+), 29 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 8c70910b5..a7b0c90a4 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -87,18 +87,13 @@ impl DocumentBatchBuilder { count: &mut self.count, }; - de.deserialize_any(&mut visitor).map_err(Error::JsonError)?; - - Ok(()) + de.deserialize_any(&mut visitor).map_err(Error::JsonError)? } - /// Extends the builder with json documents from a reader. + /// Creates a builder from a reader of CSV documents. /// - /// This method can be only called once and is mutually exclusive with extend from json. This - /// is because the fields in a csv are always guaranteed to come in order, and permits some - /// optimizations. - /// - /// From csv takes care to call finish in the end. + /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform + /// optimisations, and extending from another CSV is not allowed. pub fn from_csv(reader: R, writer: W) -> Result { let mut this = Self::new(writer)?; @@ -108,8 +103,7 @@ impl DocumentBatchBuilder { let mut records = csv::Reader::from_reader(reader); let headers = records - .headers() - .unwrap() + .headers()? 
.into_iter() .map(parse_csv_header) .map(|(k, t)| (this.index.insert(&k), t)) @@ -123,11 +117,11 @@ impl DocumentBatchBuilder { let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).unwrap(), + AllowedType::Number => value.parse::().map(Value::from)?, AllowedType::String => Value::String(value.to_string()), }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value).unwrap(); + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; writer.insert(*fid, &this.value_buffer)?; this.value_buffer.clear(); } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 8a8b87794..bbc114480 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,7 +7,9 @@ mod builder; mod reader; mod serde; -use std::{fmt, io}; +use std::num::ParseFloatError; +use std::io; +use std::fmt::{self, Debug}; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; @@ -81,14 +83,22 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { + ParseFloat(std::num::ParseFloatError), InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), + CsvError(csv::Error), Serialize(bincode::Error), Io(io::Error), DocumentTooLarge, } +impl From for Error { + fn from(e: csv::Error) -> Self { + Self::CsvError(e) + } +} + impl From for Error { fn from(other: io::Error) -> Self { Self::Io(other) @@ -101,15 +111,29 @@ impl From for Error { } } +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::JsonError(other) + } +} + +impl From for Error { + fn from(other: ParseFloatError) -> Self { + Self::ParseFloat(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { + Error::ParseFloat(e) => write!(f, "{}", e), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), - Error::Io(e) => e.fmt(f), + Error::Io(e) => write!(f, "{}", e), Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), - Error::Serialize(e) => e.fmt(f), + Error::Serialize(e) => write!(f, "{}", e), + Error::CsvError(e) => write!(f, "{}", e), } } } diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 2466ed373..4dfdca2c7 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -11,9 +11,19 @@ use serde::de::SeqAccess; use serde::de::Visitor; use serde_json::Value; +use super::Error; use super::{ByteCounter, DocumentsBatchIndex}; use crate::FieldId; +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { @@ -36,8 +46,8 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { Ok(self.0.insert(v)) } - fn expecting(&self, _formatter: &mut fmt::Formatter) -> fmt::Result { - todo!() + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a string") } } @@ -64,22 +74,22 @@ pub struct DocumentVisitor<'a, W> { impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { /// This Visitor value is nothing, since it write the value to a file. 
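    /// Errors are smuggled out through the `Ok` variant instead (see the `tri!` macro above):
    /// serde only lets a visitor fail with the deserializer's own error type, so the crate's
    /// `Error` is returned as `Ok(Err(_))` and unwrapped by the caller.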
- type Value = (); + type Value = Result<(), Error>; fn visit_seq(self, mut seq: A) -> Result where A: SeqAccess<'de>, { - while let Some(_) = seq.next_element_seed(&mut *self)? { } + while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } - Ok(()) + Ok(Ok(())) } fn visit_map(self, mut map: A) -> Result where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer).unwrap() { + while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { self.values.insert(key, value); } @@ -88,19 +98,19 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { for (key, value) in self.values.iter() { self.value_buffer.clear(); // This is guaranteed to work - serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value).unwrap(); - obkv.insert(*key, &self.value_buffer).unwrap(); + tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); + tri!(obkv.insert(*key, &self.value_buffer)); } - let reader = obkv.into_inner().unwrap().into_inner(); + let reader = tri!(obkv.into_inner()).into_inner(); - self.inner.write_u32::(reader.len() as u32).unwrap(); - self.inner.write_all(reader).unwrap(); + tri!(self.inner.write_u32::(reader.len() as u32)); + tri!(self.inner.write_all(reader)); *self.count += 1; self.values.clear(); - Ok(()) + Ok(Ok(())) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -111,7 +121,7 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> where W: Write, { - type Value = (); + type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where From 430e9b13d3523a17a5354a7a2f8d955a3097a008 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 09:48:53 +0200 Subject: [PATCH 06/22] add csv builder tests --- milli/src/documents/builder.rs | 324 ++++++++++++++++-- milli/src/documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 2 +- 3 files changed, 297 insertions(+), 31 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index a7b0c90a4..d54d9639c 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -54,12 +54,7 @@ impl DocumentBatchBuilder { /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. pub fn finish(self) -> Result<(), Error> { - let Self { - inner: ByteCounter { mut writer, count: offset }, - index, - count, - .. - } = self; + let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -73,7 +68,6 @@ impl DocumentBatchBuilder { Ok(()) } - /// Extends the builder with json documents from a reader. pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { let mut de = serde_json::Deserializer::from_reader(reader); @@ -95,7 +89,6 @@ impl DocumentBatchBuilder { /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform /// optimisations, and extending from another CSV is not allowed. 
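    ///
    /// A short sketch of the call pattern, mirroring the tests below (values illustrative):
    ///
    /// ```ignore
    /// let mut writer = std::io::Cursor::new(Vec::new());
    /// let builder = DocumentBatchBuilder::from_csv("id:number,title:string\n1,hello".as_bytes(), &mut writer).unwrap();
    /// builder.finish().unwrap();
    /// ```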
pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; // Ensure that this is the first and only addition made with this builder debug_assert!(this.index.is_empty()); @@ -112,28 +105,24 @@ impl DocumentBatchBuilder { let records = records.into_records(); for record in records { - match record { - Ok(record) => { - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, - AllowedType::String => Value::String(value.to_string()), - }; + let record = record?; + let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { + let value = match ty { + AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::String => Value::String(value.to_string()), + }; - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; - this.value_buffer.clear(); - } - - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.obkv_buffer.clear(); - this.count += 1; - }, - Err(_) => panic!(), + serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; + writer.insert(*fid, &this.value_buffer)?; + this.value_buffer.clear(); } + + this.inner.write_u32::(this.obkv_buffer.len() as u32)?; + this.inner.write_all(&this.obkv_buffer)?; + + this.obkv_buffer.clear(); + this.count += 1; } Ok(this) @@ -162,10 +151,25 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { mod test { use std::io::Cursor; + use serde_json::{json, Map}; + use crate::documents::DocumentBatchReader; use super::*; + fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { + let mut map = Map::new(); + + for (fid, value) in obkv.iter() { + let field_name = index.name(fid).unwrap().clone(); + let value: Value = serde_json::from_slice(value).unwrap(); + + map.insert(field_name, value); + } + + Value::Object(map) + } + #[test] fn add_single_documents_json() { let mut cursor = Cursor::new(Vec::new()); @@ -247,7 +251,8 @@ mod test { let csv = "id:number,field:string\n1,hello!\n2,blabla"; - let builder = DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); + let builder = + DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); builder.finish().unwrap(); cursor.set_position(0); @@ -263,4 +268,265 @@ mod test { assert!(reader.next_document_with_index().unwrap().is_none()); } + + #[test] + fn simple_csv_document() { + let documents = r#"city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + + assert!(reader.next_document_with_index().unwrap().is_none()); + } + + #[test] + fn coma_in_field() { + let documents = r#"city,country,pop +"Boston","United, States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let 
mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United, States", + "pop": "4628910", + }) + ); + } + + #[test] + fn quote_in_field() { + let documents = r#"city,country,pop +"Boston","United"" States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United\" States", + "pop": "4628910", + }) + ); + } + + #[test] + fn integer_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.0, + }) + ); + } + + #[test] + fn float_in_field() { + let documents = r#"city,country,pop:number +"Boston","United States","4628910.01""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city": "Boston", + "country": "United States", + "pop": 4628910.01, + }) + ); + } + + #[test] + fn several_colon_in_header() { + let documents = r#"city:love:string,country:state,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:love": "Boston", + "country:state": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn ending_by_colon_in_header() { + let documents = r#"city:,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn starting_by_colon_in_header() { + let documents = r#":city,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = 
DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + ":city": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[ignore] + #[test] + fn starting_by_colon_in_header2() { + let documents = r#":string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + + assert!(reader.next_document_with_index().is_err()); + } + + #[test] + fn double_colon_in_header() { + let documents = r#"city::string,country,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) + .unwrap() + .finish() + .unwrap(); + let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); + let val = obkv_to_value(&doc, index); + + assert_eq!( + val, + json!({ + "city:": "Boston", + "country": "United States", + "pop": "4628910", + }) + ); + } + + #[test] + fn bad_type_in_header() { + let documents = r#"city,country:number,pop +"Boston","United States","4628910""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count1() { + let documents = r#"city,country,pop +"Boston","United States","4628910", "too much""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } + + #[test] + fn bad_column_count2() { + let documents = r#"city,country,pop +"Boston","United States""#; + + let mut buf = Vec::new(); + assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index bbc114480..b61a3326b 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -47,7 +47,7 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn get_id(&self, id: FieldId) -> Option<&String> { + pub fn name(&self, id: FieldId) -> Option<&String> { self.0.get_by_left(&id) } } diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 5af1eda72..eac60effb 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -179,7 +179,7 @@ impl Transform<'_, '_> { if !self.autogenerate_docids { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.get_id(key).cloned(); + let key = addition_index.name(key).cloned(); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { From 3fcccc31b5885029499056adaaf42853ab45d1ce Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 10:26:28 +0200 Subject: [PATCH 07/22] add document builder example --- milli/src/documents/builder.rs | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index d54d9639c..144dcdfa9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -16,7 +16,20 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, 
Error}; /// format used by milli. /// /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to -/// iterate other the documents. +/// iterate over the documents. +/// +/// ## example: +/// ``` +/// use milli::documents::DocumentBatchBuilder; +/// use serde_json::json; +/// use std::io::Cursor; +/// +/// let json = r##"{"id": 1, "name": "foo"}"##; +/// let mut writer = Cursor::new(Vec::new()); +/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); +/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.finish().unwrap(); +/// ``` pub struct DocumentBatchBuilder { inner: ByteCounter, index: DocumentsBatchIndex, From 679fe18b17f7f1458cf336f8299be20ad2aca98f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:52:17 +0200 Subject: [PATCH 08/22] Update version for the next release (v0.19.0) --- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- search/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index b3a15f100..341b8eb7c 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index a7af6fb9b..310388e01 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index 2701c36d7..726fa9c5f 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 01309797d..12f474f12 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.18.1" +version = "0.19.0" authors = ["Kerollmops "] edition = "2018" diff --git a/search/Cargo.toml b/search/Cargo.toml index dd74ad7b6..d9441984e 100644 --- a/search/Cargo.toml +++ b/search/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "search" -version = "0.18.1" +version = "0.19.0" authors = ["Clément Renault "] edition = "2018" From 208903dddea1567f2c2c3afe719f0f512b9d0fe4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?= Date: Mon, 25 Oct 2021 11:58:00 +0200 Subject: [PATCH 09/22] Revert "Replacing pest with nom " --- milli/Cargo.toml | 3 +- milli/src/error.rs | 11 +- milli/src/lib.rs | 3 + milli/src/search/facet/filter_condition.rs | 708 ++++++++++++++++++++- milli/src/search/facet/filter_parser.rs | 622 ------------------ milli/src/search/facet/grammar.pest | 33 + milli/src/search/facet/mod.rs | 5 +- milli/src/search/facet/parser.rs | 12 + milli/src/search/mod.rs | 3 +- 9 files changed, 736 insertions(+), 664 deletions(-) delete mode 100644 milli/src/search/facet/filter_parser.rs create mode 100644 milli/src/search/facet/grammar.pest create mode 100644 milli/src/search/facet/parser.rs diff --git a/milli/Cargo.toml b/milli/Cargo.toml index 594cc60e0..3792204e9 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -39,7 +39,8 @@ tempfile = "3.2.0" uuid = { version = "0.8.2", features = ["v4"] } # facet filter parser -nom = "7.0.0" +pest = { git = 
"https://github.com/pest-parser/pest.git", rev = "51fd1d49f1041f7839975664ef71fe15c7dcaf67" } +pest_derive = "2.1.0" # documents words self-join itertools = "0.10.0" diff --git a/milli/src/error.rs b/milli/src/error.rs index c0ce101c8..1f1cc5264 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; use serde_json::{Map, Value}; +use crate::search::ParserRule; use crate::{CriterionError, DocumentId, FieldId, SortError}; pub type Object = Map; @@ -58,9 +59,9 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFilter(pest::error::Error), + InvalidFilterAttribute(pest::error::Error), InvalidGeoField { document_id: Value, object: Value }, - InvalidFilter { input: String }, - InvalidSortName { name: String }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, @@ -207,7 +208,6 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidFilter { input } => write!(f, "parser error {}", input), Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), @@ -220,6 +220,7 @@ impl fmt::Display for UserError { name_list ) } + Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, "the document with the id: {} contains an invalid _geo field: {}", @@ -235,9 +236,7 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco json ) } - Self::InvalidSortName { name } => { - write!(f, "Invalid syntax for the sort parameter: {}", name) - } + Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 6fe5947f5..781cedb2c 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -1,3 +1,6 @@ +#[macro_use] +extern crate pest_derive; + #[macro_use] pub mod documents; diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 4fedeee69..f1055b2f8 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -1,20 +1,60 @@ +use std::collections::HashSet; use std::fmt::Debug; use std::ops::Bound::{self, Excluded, Included}; +use std::result::Result as StdResult; +use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; +use itertools::Itertools; use log::debug; -use nom::error::{convert_error, VerboseError}; +use pest::error::{Error as PestError, ErrorVariant}; +use pest::iterators::{Pair, Pairs}; +use pest::Parser; use roaring::RoaringBitmap; use self::FilterCondition::*; -use super::filter_parser::{Operator, ParseContext}; +use self::Operator::*; +use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::{Error, UserError}; +use crate::error::UserError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; -use crate::{distance_between_two_points, CboRoaringBitmapCodec, FieldId, Index, Result}; +use crate::{ + distance_between_two_points, 
CboRoaringBitmapCodec, FieldId, FieldsIdsMap, Index, Result, +}; + +#[derive(Debug, Clone, PartialEq)] +pub enum Operator { + GreaterThan(f64), + GreaterThanOrEqual(f64), + Equal(Option, String), + NotEqual(Option, String), + LowerThan(f64), + LowerThanOrEqual(f64), + Between(f64, f64), + GeoLowerThan([f64; 2], f64), + GeoGreaterThan([f64; 2], f64), +} + +impl Operator { + /// This method can return two operations in case it must express + /// an OR operation for the between case (i.e. `TO`). + fn negate(self) -> (Self, Option) { + match self { + GreaterThan(n) => (LowerThanOrEqual(n), None), + GreaterThanOrEqual(n) => (LowerThan(n), None), + Equal(n, s) => (NotEqual(n, s), None), + NotEqual(n, s) => (Equal(n, s), None), + LowerThan(n) => (GreaterThanOrEqual(n), None), + LowerThanOrEqual(n) => (GreaterThan(n), None), + Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), + GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), + GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), + } + } +} #[derive(Debug, Clone, PartialEq)] pub enum FilterCondition { @@ -36,7 +76,7 @@ impl FilterCondition { A: AsRef, B: AsRef, { - let mut ands: Option = None; + let mut ands = None; for either in array { match either { @@ -77,23 +117,41 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let ctx = - ParseContext { fields_ids_map: &fields_ids_map, filterable_fields: &filterable_fields }; - match ctx.parse_expression::>(expression) { - Ok((_, fc)) => Ok(fc), - Err(e) => { - let ve = match e { - nom::Err::Error(x) => x, - nom::Err::Failure(x) => x, - _ => unreachable!(), - }; - Err(Error::UserError(UserError::InvalidFilter { - input: convert_error(expression, ve).to_string(), - })) - } - } + let lexed = + FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } - pub fn negate(self) -> FilterCondition { + + fn from_pairs( + fim: &FieldsIdsMap, + ff: &HashSet, + expression: Pairs, + ) -> Result { + PREC_CLIMBER.climb( + expression, + |pair: Pair| match pair.as_rule() { + Rule::greater => Ok(Self::greater_than(fim, ff, pair)?), + Rule::geq => Ok(Self::greater_than_or_equal(fim, ff, pair)?), + Rule::eq => Ok(Self::equal(fim, ff, pair)?), + Rule::neq => Ok(Self::equal(fim, ff, pair)?.negate()), + Rule::leq => Ok(Self::lower_than_or_equal(fim, ff, pair)?), + Rule::less => Ok(Self::lower_than(fim, ff, pair)?), + Rule::between => Ok(Self::between(fim, ff, pair)?), + Rule::geo_radius => Ok(Self::geo_radius(fim, ff, pair)?), + Rule::not => Ok(Self::from_pairs(fim, ff, pair.into_inner())?.negate()), + Rule::prgm => Self::from_pairs(fim, ff, pair.into_inner()), + Rule::term => Self::from_pairs(fim, ff, pair.into_inner()), + _ => unreachable!(), + }, + |lhs: Result, op: Pair, rhs: Result| match op.as_rule() { + Rule::or => Ok(Or(Box::new(lhs?), Box::new(rhs?))), + Rule::and => Ok(And(Box::new(lhs?), Box::new(rhs?))), + _ => unreachable!(), + }, + ) + } + + fn negate(self) -> FilterCondition { match self { Operator(fid, op) => match op.negate() { (op, None) => Operator(fid, op), @@ -104,6 +162,189 @@ impl FilterCondition { Empty => Empty, } } + + fn geo_radius( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + if !filterable_fields.contains("_geo") { + return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( + 
ErrorVariant::CustomError { + message: format!( + "attribute `_geo` is not filterable, available filterable attributes are: {}", + filterable_fields.iter().join(", "), + ), + }, + item.as_span(), + )))?; + } + let mut items = item.into_inner(); + let fid = match fields_ids_map.id("_geo") { + Some(fid) => fid, + None => return Ok(Empty), + }; + let parameters_item = items.next().unwrap(); + // We don't need more than 3 parameters, but to handle errors correctly we are still going + // to extract the first 4 parameters + let param_span = parameters_item.as_span(); + let parameters = parameters_item + .into_inner() + .take(4) + .map(|param| (param.clone(), param.as_span())) + .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) + .collect::, _>>() + .map_err(UserError::InvalidFilter)?; + if parameters.len() != 3 { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), + }, + // we want to point to the last parameters and if there was no parameters we + // point to the parenthesis + parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), + )))?; + } + let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); + if !(-90.0..=90.0).contains(&lat.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Latitude must be contained between -90 and 90 degrees."), + }, + lat.1.clone(), + )))?; + } else if !(-180.0..=180.0).contains(&lng.0) { + return Err(UserError::InvalidFilter(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!("Longitude must be contained between -180 and 180 degrees."), + }, + lng.1.clone(), + )))?; + } + Ok(Operator(fid, GeoLowerThan([lat.0, lng.0], distance))) + } + + fn between( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let (lresult, _) = pest_parse(items.next().unwrap()); + let (rresult, _) = pest_parse(items.next().unwrap()); + + let lvalue = lresult.map_err(UserError::InvalidFilter)?; + let rvalue = rresult.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, Between(lvalue, rvalue))) + } + + fn equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, svalue) = pest_parse(value); + + let svalue = svalue.to_lowercase(); + Ok(Operator(fid, Equal(result.ok(), svalue))) + } + + fn greater_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? 
+ { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThan(value))) + } + + fn greater_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, GreaterThanOrEqual(value))) + } + + fn lower_than( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThan(value))) + } + + fn lower_than_or_equal( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + item: Pair, + ) -> Result { + let mut items = item.into_inner(); + let fid = match field_id(fields_ids_map, filterable_fields, &mut items) + .map_err(UserError::InvalidFilterAttribute)? + { + Some(fid) => fid, + None => return Ok(Empty), + }; + + let value = items.next().unwrap(); + let (result, _svalue) = pest_parse(value); + let value = result.map_err(UserError::InvalidFilter)?; + + Ok(Operator(fid, LowerThanOrEqual(value))) + } } impl FilterCondition { @@ -227,9 +468,9 @@ impl FilterCondition { // as the facets values are all in the same database and prefixed by the // field id and the level. let (left, right) = match operator { - Operator::GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), - Operator::GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), - Operator::Equal(number, string) => { + GreaterThan(val) => (Excluded(*val), Included(f64::MAX)), + GreaterThanOrEqual(val) => (Included(*val), Included(f64::MAX)), + Equal(number, string) => { let (_original_value, string_docids) = strings_db.get(rtxn, &(field_id, &string))?.unwrap_or_default(); let number_docids = match number { @@ -251,23 +492,23 @@ impl FilterCondition { }; return Ok(string_docids | number_docids); } - Operator::NotEqual(number, string) => { + NotEqual(number, string) => { let all_numbers_ids = if number.is_some() { index.number_faceted_documents_ids(rtxn, field_id)? 
} else { RoaringBitmap::new() }; let all_strings_ids = index.string_faceted_documents_ids(rtxn, field_id)?; - let operator = Operator::Equal(*number, string.clone()); + let operator = Equal(*number, string.clone()); let docids = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, &operator, )?; return Ok((all_numbers_ids | all_strings_ids) - docids); } - Operator::LowerThan(val) => (Included(f64::MIN), Excluded(*val)), - Operator::LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), - Operator::Between(left, right) => (Included(*left), Included(*right)), - Operator::GeoLowerThan(base_point, distance) => { + LowerThan(val) => (Included(f64::MIN), Excluded(*val)), + LowerThanOrEqual(val) => (Included(f64::MIN), Included(*val)), + Between(left, right) => (Included(*left), Included(*right)), + GeoLowerThan(base_point, distance) => { let rtree = match index.geo_rtree(rtxn)? { Some(rtree) => rtree, None => return Ok(RoaringBitmap::new()), @@ -283,14 +524,14 @@ impl FilterCondition { return Ok(result); } - Operator::GeoGreaterThan(point, distance) => { + GeoGreaterThan(point, distance) => { let result = Self::evaluate_operator( rtxn, index, numbers_db, strings_db, field_id, - &Operator::GeoLowerThan(point.clone(), *distance), + &GeoLowerThan(point.clone(), *distance), )?; let geo_faceted_doc_ids = index.geo_faceted_documents_ids(rtxn)?; return Ok(geo_faceted_doc_ids - result); @@ -344,3 +585,406 @@ impl FilterCondition { } } } + +/// Retrieve the field id base on the pest value. +/// +/// Returns an error if the given value is not filterable. +/// +/// Returns Ok(None) if the given value is filterable, but is not yet ascociated to a field_id. +/// +/// The pest pair is simply a string associated with a span, a location to highlight in +/// the error message. +fn field_id( + fields_ids_map: &FieldsIdsMap, + filterable_fields: &HashSet, + items: &mut Pairs, +) -> StdResult, PestError> { + // lexing ensures that we at least have a key + let key = items.next().unwrap(); + if key.as_rule() == Rule::reserved { + let message = match key.as_str() { + key if key.starts_with("_geoPoint") => { + format!( + "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + ) + } + key @ "_geo" => { + format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression. \ + Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", + key + ) + } + key => format!( + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + key + ), + }; + return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); + } + + if !filterable_fields.contains(key.as_str()) { + return Err(PestError::new_from_span( + ErrorVariant::CustomError { + message: format!( + "attribute `{}` is not filterable, available filterable attributes are: {}.", + key.as_str(), + filterable_fields.iter().join(", "), + ), + }, + key.as_span(), + )); + } + + Ok(fields_ids_map.id(key.as_str())) +} + +/// Tries to parse the pest pair into the type `T` specified, always returns +/// the original string that we tried to parse. +/// +/// Returns the parsing error associated with the span if the conversion fails. 
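+///
+/// A rough usage sketch (illustrative only, not a compiled doc-test; `pair`
+/// is assumed to be a token produced by the filter grammar):
+///
+/// ```ignore
+/// let (result, raw) = pest_parse::<f64>(pair);
+/// match result {
+///     Ok(number) => println!("parsed {} out of {:?}", number, raw),
+///     Err(error) => println!("{:?} is not a valid number: {}", raw, error),
+/// }
+/// ```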
+fn pest_parse<T>(pair: Pair<Rule>) -> (StdResult<T, PestError<Rule>>, String)
+where
+    T: FromStr,
+    T::Err: ToString,
+{
+    let result = match pair.as_str().parse::<T>() {
+        Ok(value) => Ok(value),
+        Err(e) => Err(PestError::<Rule>::new_from_span(
+            ErrorVariant::CustomError { message: e.to_string() },
+            pair.as_span(),
+        )),
+    };
+
+    (result, pair.as_str().to_string())
+}
+
+#[cfg(test)]
+mod tests {
+    use big_s::S;
+    use heed::EnvOpenOptions;
+    use maplit::hashset;
+
+    use super::*;
+    use crate::update::Settings;
+
+    #[test]
+    fn string() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut map = index.fields_ids_map(&wtxn).unwrap();
+        map.insert("channel");
+        index.put_fields_ids_map(&mut wtxn, &map).unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_filterable_fields(hashset! { S("channel") });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Test that the facet condition is correctly generated.
+        let rtxn = index.read_txn().unwrap();
+        let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap();
+        let expected = Operator(0, Operator::Equal(None, S("ponce")));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap();
+        let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap();
+        let expected = Operator(0, Operator::NotEqual(None, S("ponce")));
+        assert_eq!(condition, expected);
+    }
+
+    #[test]
+    fn number() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut map = index.fields_ids_map(&wtxn).unwrap();
+        map.insert("timestamp");
+        index.put_fields_ids_map(&mut wtxn, &map).unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_filterable_fields(hashset! { "timestamp".into() });
+        builder.execute(|_, _| ()).unwrap();
+        wtxn.commit().unwrap();
+
+        // Test that the facet condition is correctly generated.
+        let rtxn = index.read_txn().unwrap();
+        let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap();
+        let expected = Operator(0, Between(22.0, 44.0));
+        assert_eq!(condition, expected);
+
+        let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap();
+        let expected =
+            Or(Box::new(Operator(0, LowerThan(22.0))), Box::new(Operator(0, GreaterThan(44.0))));
+        assert_eq!(condition, expected);
+    }
+
+    #[test]
+    fn parentheses() {
+        let path = tempfile::tempdir().unwrap();
+        let mut options = EnvOpenOptions::new();
+        options.map_size(10 * 1024 * 1024); // 10 MB
+        let index = Index::new(options, &path).unwrap();
+
+        // Set the filterable fields to be the channel.
+        let mut wtxn = index.write_txn().unwrap();
+        let mut builder = Settings::new(&mut wtxn, &index, 0);
+        builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order
+        builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(And( + Box::new(Operator(1, Between(22.0, 44.0))), + Box::new(Operator(0, Operator::NotEqual(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + + let condition = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", + ) + .unwrap(); + let expected = Or( + Box::new(Operator(0, Operator::Equal(None, S("gotaga")))), + Box::new(Or( + Box::new(Or( + Box::new(Operator(1, LowerThan(22.0))), + Box::new(Operator(1, GreaterThan(44.0))), + )), + Box::new(Operator(0, Operator::Equal(None, S("ponce")))), + )), + ); + assert_eq!(condition, expected); + } + + #[test] + fn reserved_field_names() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let rtxn = index.read_txn().unwrap(); + + let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); + assert!(error + .to_string() + .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), + "{}", + error.to_string() + ); + + let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + + let error = + FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); + assert!(error + .to_string() + .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), + "{}", + error.to_string() + ); + } + + #[test] + fn geo_radius() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. 
+ let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // _geo is not filterable + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + // basic test + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([12., 13.0005], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the max angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(90, 180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([90., 180.], 2000.)); + assert_eq!(condition, expected); + + // basic test with latitude and longitude at the min angle + let condition = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90, -180, 2000)").unwrap(); + let expected = Operator(0, GeoLowerThan([-90., -180.], 2000.)); + assert_eq!(condition, expected); + + // test the negation of the GeoLowerThan + let condition = + FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); + let expected = Operator(0, GeoGreaterThan([50., 18.], 2000.500)); + assert_eq!(condition, expected); + + // composition of multiple operations + let condition = FilterCondition::from_str( + &rtxn, + &index, + "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", + ) + .unwrap(); + let expected = Or( + Box::new(And( + Box::new(Operator(0, GeoGreaterThan([1., 2.], 300.))), + Box::new(Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), + )), + Box::new(Operator(1, LowerThanOrEqual(10.))), + ); + assert_eq!(condition, expected); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have any parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius don't have enough parameters + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have too many parameters + let result = + FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error.to_string().contains("The `_geoRadius` filter 
expect three arguments: `_geoRadius(latitude, longitude, radius)`")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad latitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Latitude must be contained between -90 and 90 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + + // georadius have a bad longitude + let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); + assert!(result.is_err()); + let error = result.unwrap_err(); + assert!(error + .to_string() + .contains("Longitude must be contained between -180 and 180 degrees.")); + } + + #[test] + fn from_array() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + + // Set the filterable fields to be the channel. + let mut wtxn = index.write_txn().unwrap(); + let mut builder = Settings::new(&mut wtxn, &index, 0); + builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order + builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); + builder.execute(|_, _| ()).unwrap(); + wtxn.commit().unwrap(); + + // Test that the facet condition is correctly generated. + let rtxn = index.read_txn().unwrap(); + let condition = FilterCondition::from_array( + &rtxn, + &index, + vec![ + Either::Right("channel = gotaga"), + Either::Left(vec!["timestamp = 44", "channel != ponce"]), + ], + ) + .unwrap() + .unwrap(); + let expected = FilterCondition::from_str( + &rtxn, + &index, + "channel = gotaga AND (timestamp = 44 OR channel != ponce)", + ) + .unwrap(); + assert_eq!(condition, expected); + } +} diff --git a/milli/src/search/facet/filter_parser.rs b/milli/src/search/facet/filter_parser.rs deleted file mode 100644 index 4d8a54987..000000000 --- a/milli/src/search/facet/filter_parser.rs +++ /dev/null @@ -1,622 +0,0 @@ -use std::collections::HashSet; -use std::fmt::Debug; -use std::result::Result as StdResult; - -use nom::branch::alt; -use nom::bytes::complete::{tag, take_while1}; -use nom::character::complete::{char, multispace0}; -use nom::combinator::map; -use nom::error::{ContextError, ErrorKind, VerboseError}; -use nom::multi::{many0, separated_list1}; -use nom::sequence::{delimited, preceded, tuple}; -use nom::IResult; - -use self::Operator::*; -use super::FilterCondition; -use crate::{FieldId, FieldsIdsMap}; -#[derive(Debug, Clone, PartialEq)] -pub enum Operator { - GreaterThan(f64), - GreaterThanOrEqual(f64), - Equal(Option, String), - NotEqual(Option, String), - LowerThan(f64), - LowerThanOrEqual(f64), - Between(f64, f64), - GeoLowerThan([f64; 2], f64), - GeoGreaterThan([f64; 2], f64), -} - -impl Operator { - /// This method can return two operations in case it must express - /// an OR operation for the between case (i.e. `TO`). 
- pub fn negate(self) -> (Self, Option) { - match self { - GreaterThan(n) => (LowerThanOrEqual(n), None), - GreaterThanOrEqual(n) => (LowerThan(n), None), - Equal(n, s) => (NotEqual(n, s), None), - NotEqual(n, s) => (Equal(n, s), None), - LowerThan(n) => (GreaterThanOrEqual(n), None), - LowerThanOrEqual(n) => (GreaterThan(n), None), - Between(n, m) => (LowerThan(n), Some(GreaterThan(m))), - GeoLowerThan(point, distance) => (GeoGreaterThan(point, distance), None), - GeoGreaterThan(point, distance) => (GeoLowerThan(point, distance), None), - } - } -} - -pub trait FilterParserError<'a>: - nom::error::ParseError<&'a str> + ContextError<&'a str> + std::fmt::Debug -{ -} - -impl<'a> FilterParserError<'a> for VerboseError<&'a str> {} - -pub struct ParseContext<'a> { - pub fields_ids_map: &'a FieldsIdsMap, - pub filterable_fields: &'a HashSet, -} - -impl<'a> ParseContext<'a> { - fn parse_or(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_and(input)?; - let (input, ors) = many0(preceded(self.ws(tag("OR")), |c| Self::parse_or(self, c)))(input)?; - - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::Or(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_and(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, lhs) = self.parse_not(input)?; - let (input, ors) = - many0(preceded(self.ws(tag("AND")), |c| Self::parse_and(self, c)))(input)?; - let expr = ors - .into_iter() - .fold(lhs, |acc, branch| FilterCondition::And(Box::new(acc), Box::new(branch))); - Ok((input, expr)) - } - - fn parse_not(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - map( - preceded(alt((self.ws(tag("!")), self.ws(tag("NOT")))), |c| { - Self::parse_condition_expression(self, c) - }), - |e| e.negate(), - ), - |c| Self::parse_condition_expression(self, c), - ))(input) - } - - fn ws(&'a self, inner: F) -> impl FnMut(&'a str) -> IResult<&'a str, O, E> - where - F: Fn(&'a str) -> IResult<&'a str, O, E>, - E: FilterParserError<'a>, - { - delimited(multispace0, inner, multispace0) - } - - fn parse_simple_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let operator = alt((tag("<="), tag(">="), tag(">"), tag("="), tag("<"), tag("!="))); - let k = tuple((self.ws(|c| self.parse_key(c)), operator, self.ws(|c| self.parse_key(c))))( - input, - ); - let (input, (key, op, value)) = match k { - Ok(o) => o, - Err(e) => { - return Err(e); - } - }; - - let fid = self.parse_fid(input, key)?; - let r: StdResult>> = self.parse_numeric(value); - let k = match op { - "=" => FilterCondition::Operator(fid, Equal(r.ok(), value.to_string().to_lowercase())), - "!=" => { - FilterCondition::Operator(fid, NotEqual(r.ok(), value.to_string().to_lowercase())) - } - ">" | "<" | "<=" | ">=" => return self.parse_numeric_unary_condition(op, fid, value), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_numeric(&'a self, input: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - T: std::str::FromStr, - { - match input.parse::() { - Ok(n) => Ok(n), - Err(_) => { - return match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - } - } - } - - fn parse_numeric_unary_condition( - &'a self, - input: &'a str, 
- fid: u16, - value: &'a str, - ) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let numeric: f64 = self.parse_numeric(value)?; - let k = match input { - ">" => FilterCondition::Operator(fid, GreaterThan(numeric)), - "<" => FilterCondition::Operator(fid, LowerThan(numeric)), - "<=" => FilterCondition::Operator(fid, LowerThanOrEqual(numeric)), - ">=" => FilterCondition::Operator(fid, GreaterThanOrEqual(numeric)), - _ => unreachable!(), - }; - Ok((input, k)) - } - - fn parse_fid(&'a self, input: &'a str, key: &'a str) -> StdResult> - where - E: FilterParserError<'a>, - { - let error = match input.chars().nth(0) { - Some(ch) => Err(nom::Err::Failure(E::from_char(input, ch))), - None => Err(nom::Err::Failure(E::from_error_kind(input, ErrorKind::Eof))), - }; - if !self.filterable_fields.contains(key) { - return error; - } - match self.fields_ids_map.id(key) { - Some(fid) => Ok(fid), - None => error, - } - } - - fn parse_range_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let (input, (key, from, _, to)) = tuple(( - self.ws(|c| self.parse_key(c)), - self.ws(|c| self.parse_key(c)), - tag("TO"), - self.ws(|c| self.parse_key(c)), - ))(input)?; - - let fid = self.parse_fid(input, key)?; - let numeric_from: f64 = self.parse_numeric(from)?; - let numeric_to: f64 = self.parse_numeric(to)?; - let res = FilterCondition::Operator(fid, Between(numeric_from, numeric_to)); - - Ok((input, res)) - } - - fn parse_geo_radius(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let err_msg_args_incomplete= "_geoRadius. The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"; - let err_msg_latitude_invalid = - "_geoRadius. Latitude must be contained between -90 and 90 degrees."; - - let err_msg_longitude_invalid = - "_geoRadius. 
Longitude must be contained between -180 and 180 degrees."; - - let (input, args): (&str, Vec<&str>) = match preceded( - tag("_geoRadius"), - delimited( - char('('), - separated_list1(tag(","), self.ws(|c| self.parse_value::(c))), - char(')'), - ), - )(input) - { - Ok(e) => e, - Err(_e) => { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_args_incomplete, - E::from_char(input, '('), - ))); - } - }; - - if args.len() != 3 { - let e = E::from_char(input, '('); - return Err(nom::Err::Failure(E::add_context(input, err_msg_args_incomplete, e))); - } - let lat = self.parse_numeric(args[0])?; - let lng = self.parse_numeric(args[1])?; - let dis = self.parse_numeric(args[2])?; - - let fid = match self.fields_ids_map.id("_geo") { - Some(fid) => fid, - None => return Ok((input, FilterCondition::Empty)), - }; - - if !(-90.0..=90.0).contains(&lat) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_latitude_invalid, - E::from_char(input, '('), - ))); - } else if !(-180.0..=180.0).contains(&lng) { - return Err(nom::Err::Failure(E::add_context( - input, - err_msg_longitude_invalid, - E::from_char(input, '('), - ))); - } - - let res = FilterCondition::Operator(fid, GeoLowerThan([lat, lng], dis)); - Ok((input, res)) - } - - fn parse_condition(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - let l1 = |c| self.parse_simple_condition(c); - let l2 = |c| self.parse_range_condition(c); - let l3 = |c| self.parse_geo_radius(c); - alt((l1, l2, l3))(input) - } - - fn parse_condition_expression(&'a self, input: &'a str) -> IResult<&str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - alt(( - delimited(self.ws(char('(')), |c| Self::parse_expression(self, c), self.ws(char(')'))), - |c| Self::parse_condition(self, c), - ))(input) - } - - fn parse_key(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn parse_value(&'a self, input: &'a str) -> IResult<&'a str, &'a str, E> - where - E: FilterParserError<'a>, - { - let key = |input| take_while1(Self::is_key_component)(input); - alt((key, delimited(char('"'), key, char('"'))))(input) - } - - fn is_key_component(c: char) -> bool { - c.is_alphanumeric() || ['_', '-', '.'].contains(&c) - } - - pub fn parse_expression(&'a self, input: &'a str) -> IResult<&'a str, FilterCondition, E> - where - E: FilterParserError<'a>, - { - self.parse_or(input) - } -} - -#[cfg(test)] -mod tests { - use big_s::S; - use either::Either; - use heed::EnvOpenOptions; - use maplit::hashset; - - use super::*; - use crate::update::Settings; - use crate::Index; - - #[test] - fn string() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("channel"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { S("channel") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. 
- let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel = Ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::Equal(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "channel != ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT channel = ponce").unwrap(); - let expected = FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce"))); - assert_eq!(condition, expected); - } - - #[test] - fn number() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut map = index.fields_ids_map(&wtxn).unwrap(); - map.insert("timestamp"); - index.put_fields_ids_map(&mut wtxn, &map).unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_filterable_fields(hashset! { "timestamp".into() }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Operator(0, Between(22.0, 44.0)); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str(&rtxn, &index, "NOT timestamp 22 TO 44").unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, LowerThan(22.0))), - Box::new(FilterCondition::Operator(0, GreaterThan(44.0))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn compare() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp"), S("id")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") ,S("id")}); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "channel < 20").unwrap(); - let expected = FilterCondition::Operator(0, LowerThan(20.0)); - assert_eq!(condition, expected); - - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str(&rtxn, &index, "id < 200").unwrap(); - let expected = FilterCondition::Operator(2, LowerThan(200.0)); - assert_eq!(condition, expected); - } - - #[test] - fn parentheses() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(1, Between(22.0, 44.0))), - Box::new(FilterCondition::Operator(0, Operator::NotEqual(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - - let condition = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga OR NOT (timestamp 22 TO 44 AND channel != ponce)", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("gotaga")))), - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Or( - Box::new(FilterCondition::Operator(1, LowerThan(22.0))), - Box::new(FilterCondition::Operator(1, GreaterThan(44.0))), - )), - Box::new(FilterCondition::Operator(0, Operator::Equal(None, S("ponce")))), - )), - ); - assert_eq!(condition, expected); - } - - #[test] - fn from_array() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("channel"), S("timestamp")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("channel"), S("timestamp") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - // Test that the facet condition is correctly generated. - let rtxn = index.read_txn().unwrap(); - let condition = FilterCondition::from_array( - &rtxn, - &index, - vec![ - Either::Right("channel = gotaga"), - Either::Left(vec!["timestamp = 44", "channel != ponce"]), - ], - ) - .unwrap() - .unwrap(); - let expected = FilterCondition::from_str( - &rtxn, - &index, - "channel = gotaga AND (timestamp = 44 OR channel != ponce)", - ) - .unwrap(); - assert_eq!(condition, expected); - } - #[test] - fn geo_radius() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! 
{ S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - // basic test - let condition = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 13.0005, 2000)").unwrap(); - let expected = FilterCondition::Operator(0, GeoLowerThan([12., 13.0005], 2000.)); - assert_eq!(condition, expected); - - // test the negation of the GeoLowerThan - let condition = - FilterCondition::from_str(&rtxn, &index, "NOT _geoRadius(50, 18, 2000.500)").unwrap(); - let expected = FilterCondition::Operator(0, GeoGreaterThan([50., 18.], 2000.500)); - assert_eq!(condition, expected); - - // composition of multiple operations - let condition = FilterCondition::from_str( - &rtxn, - &index, - "(NOT _geoRadius(1, 2, 300) AND _geoRadius(1.001, 2.002, 1000.300)) OR price <= 10", - ) - .unwrap(); - let expected = FilterCondition::Or( - Box::new(FilterCondition::And( - Box::new(FilterCondition::Operator(0, GeoGreaterThan([1., 2.], 300.))), - Box::new(FilterCondition::Operator(0, GeoLowerThan([1.001, 2.002], 1000.300))), - )), - Box::new(FilterCondition::Operator(1, LowerThanOrEqual(10.))), - ); - assert_eq!(condition, expected); - } - - #[test] - fn geo_radius_error() { - let path = tempfile::tempdir().unwrap(); - let mut options = EnvOpenOptions::new(); - options.map_size(10 * 1024 * 1024); // 10 MB - let index = Index::new(options, &path).unwrap(); - - // Set the filterable fields to be the channel. - let mut wtxn = index.write_txn().unwrap(); - let mut builder = Settings::new(&mut wtxn, &index, 0); - builder.set_searchable_fields(vec![S("_geo"), S("price")]); // to keep the fields order - builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); - builder.execute(|_, _| ()).unwrap(); - wtxn.commit().unwrap(); - - let rtxn = index.read_txn().unwrap(); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have any parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius don't have enough parameters - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - // georadius have too many parameters - let result = - FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`")); - - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad latitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-90.0000001, 150, 10)"); - assert!(result.is_err()); - let error = 
result.unwrap_err(); - assert!(error - .to_string() - .contains("Latitude must be contained between -90 and 90 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 250, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - - // georadius have a bad longitude - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-10, 180.000001, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("Longitude must be contained between -180 and 180 degrees.")); - } -} diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest new file mode 100644 index 000000000..8bfdeb667 --- /dev/null +++ b/milli/src/search/facet/grammar.pest @@ -0,0 +1,33 @@ +key = _{reserved | quoted | word } +value = _{quoted | word } +quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP } +string = {char*} +word = ${(LETTER | NUMBER | "_" | "-" | ".")+} + +char = _{ !(PEEK | "\\") ~ ANY + | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t") + | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})} + +reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" } +// we deliberately choose to allow empty parameters to generate more specific error message later +parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""} +condition = _{between | eq | greater | less | geq | leq | neq} +between = {key ~ value ~ "TO" ~ value} +geq = {key ~ ">=" ~ value} +leq = {key ~ "<=" ~ value} +neq = {key ~ "!=" ~ value} +eq = {key ~ "=" ~ value} +greater = {key ~ ">" ~ value} +less = {key ~ "<" ~ value} +geo_radius = {"_geoRadius" ~ parameters } + +prgm = {SOI ~ expr ~ EOI} +expr = _{ ( term ~ (operation ~ term)* ) } +term = { ("(" ~ expr ~ ")") | condition | not | geo_radius } +operation = _{ and | or } +and = {"AND"} +or = {"OR"} + +not = {"NOT" ~ term} + +WHITESPACE = _{ " " } diff --git a/milli/src/search/facet/mod.rs b/milli/src/search/facet/mod.rs index 3efa0262f..ddf710e32 100644 --- a/milli/src/search/facet/mod.rs +++ b/milli/src/search/facet/mod.rs @@ -1,10 +1,11 @@ pub use self::facet_distribution::FacetDistribution; pub use self::facet_number::{FacetNumberIter, FacetNumberRange, FacetNumberRevRange}; pub use self::facet_string::FacetStringIter; -pub use self::filter_condition::FilterCondition; +pub use self::filter_condition::{FilterCondition, Operator}; +pub(crate) use self::parser::Rule as ParserRule; mod facet_distribution; mod facet_number; mod facet_string; mod filter_condition; -mod filter_parser; +mod parser; diff --git a/milli/src/search/facet/parser.rs b/milli/src/search/facet/parser.rs new file mode 100644 index 000000000..1bff27cfb --- /dev/null +++ b/milli/src/search/facet/parser.rs @@ -0,0 +1,12 @@ +use once_cell::sync::Lazy; +use pest::prec_climber::{Assoc, Operator, PrecClimber}; + +pub static PREC_CLIMBER: Lazy> = Lazy::new(|| { + use Assoc::*; + use Rule::*; + pest::prec_climber::PrecClimber::new(vec![Operator::new(or, Left), Operator::new(and, Left)]) +}); + +#[derive(Parser)] +#[grammar = "search/facet/grammar.pest"] +pub struct FilterParser; diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs index 9b76ca851..bec059d46 100644 --- a/milli/src/search/mod.rs +++ b/milli/src/search/mod.rs @@ -14,7 +14,8 @@ use meilisearch_tokenizer::{Analyzer, AnalyzerConfig}; use once_cell::sync::Lazy; use roaring::bitmap::RoaringBitmap; -pub use 
self::facet::{FacetDistribution, FacetNumberIter, FilterCondition}; +pub(crate) use self::facet::ParserRule; +pub use self::facet::{FacetDistribution, FacetNumberIter, FilterCondition, Operator}; pub use self::matching_words::MatchingWords; use self::query_tree::QueryTreeBuilder; use crate::error::UserError; From f9445c1d9059a77274f70508e2d0c896d9404bbb Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 16:07:57 +0200 Subject: [PATCH 10/22] return float parsing error context in csv --- milli/src/documents/builder.rs | 11 +++++++---- milli/src/documents/mod.rs | 14 ++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 144dcdfa9..c2b0e01cc 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -115,14 +115,17 @@ impl DocumentBatchBuilder { .map(|(k, t)| (this.index.insert(&k), t)) .collect::>(); - let records = records.into_records(); - - for record in records { + for (i, record) in records.into_records().enumerate() { let record = record?; let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from)?, + AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + })?, AllowedType::String => Value::String(value.to_string()), }; diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index b61a3326b..6f653d825 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -83,7 +83,11 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat(std::num::ParseFloatError), + ParseFloat { + error: std::num::ParseFloatError, + line: usize, + value: String, + }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -117,16 +121,10 @@ impl From for Error { } } -impl From for Error { - fn from(other: ParseFloatError) -> Self { - Self::ParseFloat(other) - } -} - impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat(e) => write!(f, "{}", e), + Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), From baddd80069b26d66421ce68a7a2e004e148646b0 Mon Sep 17 00:00:00 2001 From: marin postma Date: Mon, 25 Oct 2021 17:38:32 +0200 Subject: [PATCH 11/22] implement review suggestions --- benchmarks/benches/utils.rs | 32 +- cli/src/main.rs | 28 +- http-ui/src/documents_from_csv.rs | 285 ------------------ http-ui/src/main.rs | 28 +- milli/src/documents/builder.rs | 54 ++-- milli/src/documents/mod.rs | 17 +- milli/src/documents/serde.rs | 39 +-- milli/src/update/index_documents/mod.rs | 3 +- milli/src/update/index_documents/transform.rs | 3 +- 9 files changed, 89 insertions(+), 400 deletions(-) delete mode 100644 http-ui/src/documents_from_csv.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 24f5d5343..dbe8fffad 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use 
std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? > 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/cli/src/main.rs b/cli/src/main.rs index b84ff3243..8e28d4a25 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -1,5 +1,5 @@ use std::fs::File; -use std::io::{stdin, Cursor, Read}; +use std::io::{stdin, BufRead, BufReader, Cursor, Read}; use std::path::PathBuf; use std::str::FromStr; @@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; -use serde_json::{Map, Value}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result> { fn documents_from_csv(reader: impl Read) -> Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let mut records = csv::Reader::from_reader(reader); - let iter = records.deserialize::>(); - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/http-ui/src/documents_from_csv.rs b/http-ui/src/documents_from_csv.rs deleted file mode 100644 index 2b62f23c2..000000000 --- a/http-ui/src/documents_from_csv.rs +++ /dev/null @@ -1,285 +0,0 @@ -use std::io::{Read, Result as IoResult}; -use std::num::ParseFloatError; - -use serde_json::{Map, Value}; - -enum AllowedType { - String, - Number, -} - -fn parse_csv_header(header: &str) -> (String, AllowedType) { - // if there are several separators we only split on the last one. - match header.rsplit_once(':') { - Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), - // we may return an error in this case. - _otherwise => (header.to_string(), AllowedType::String), - }, - None => (header.to_string(), AllowedType::String), - } -} - -pub struct CSVDocumentDeserializer -where - R: Read, -{ - documents: csv::StringRecordsIntoIter, - headers: Vec<(String, AllowedType)>, -} - -impl CSVDocumentDeserializer { - pub fn from_reader(reader: R) -> IoResult { - let mut records = csv::Reader::from_reader(reader); - - let headers = records.headers()?.into_iter().map(parse_csv_header).collect(); - - Ok(Self { documents: records.into_records(), headers }) - } -} - -impl Iterator for CSVDocumentDeserializer { - type Item = anyhow::Result>; - - fn next(&mut self) -> Option { - let csv_document = self.documents.next()?; - - match csv_document { - Ok(csv_document) => { - let mut document = Map::new(); - - for ((field_name, field_type), value) in - self.headers.iter().zip(csv_document.into_iter()) - { - let parsed_value: Result = match field_type { - AllowedType::Number => { - value.parse::().map(Value::from).map_err(Into::into) - } - AllowedType::String => Ok(Value::String(value.to_string())), - }; - - match parsed_value { - Ok(value) => drop(document.insert(field_name.to_string(), value)), - Err(_e) => { - return Some(Err(anyhow::anyhow!( - "Value '{}' is not a valid number", - value - ))) - } - } - } - - Some(Ok(document)) - } - Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))), - } - } -} - -#[cfg(test)] -mod test { - use serde_json::json; - - use super::*; - - #[test] - fn simple_csv_document() { - let documents = r#"city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": 
"Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn coma_in_field() { - let documents = r#"city,country,pop -"Boston","United, States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United, States", - "pop": "4628910", - }) - ); - } - - #[test] - fn quote_in_field() { - let documents = r#"city,country,pop -"Boston","United"" States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United\" States", - "pop": "4628910", - }) - ); - } - - #[test] - fn integer_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.0, - }) - ); - } - - #[test] - fn float_in_field() { - let documents = r#"city,country,pop:number -"Boston","United States","4628910.01""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city": "Boston", - "country": "United States", - "pop": 4628910.01, - }) - ); - } - - #[test] - fn several_double_dot_in_header() { - let documents = r#"city:love:string,country:state,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:love": "Boston", - "country:state": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn ending_by_double_dot_in_header() { - let documents = r#"city:,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header() { - let documents = r#":city,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - ":city": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn starting_by_double_dot_in_header2() { - let documents = r#":string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn double_double_dot_in_header() { - let documents = r#"city::string,country,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert_eq!( - Value::Object(csv_iter.next().unwrap().unwrap()), - json!({ - "city:": "Boston", - "country": "United States", - "pop": "4628910", - }) - ); - } - - #[test] - fn 
bad_type_in_header() { - let documents = r#"city,country:number,pop -"Boston","United States","4628910""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count1() { - let documents = r#"city,country,pop -"Boston","United States","4628910", "too much""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } - - #[test] - fn bad_column_count2() { - let documents = r#"city,country,pop -"Boston","United States""#; - - let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap(); - - assert!(csv_iter.next().unwrap().is_err()); - } -} diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index d27c6d5bb..9e9fe4a2b 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1,10 +1,9 @@ -mod documents_from_csv; mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::Cursor; +use std::io::{BufRead, BufReader, Cursor}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -39,7 +38,6 @@ use warp::http::Response; use warp::Filter; use self::update_store::UpdateStore; -use crate::documents_from_csv::CSVDocumentDeserializer; #[cfg(target_os = "linux")] #[global_allocator] @@ -1041,11 +1039,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let values = serde_json::Deserializer::from_reader(reader) - .into_iter::>(); - for document in values { - let document = document?; - documents.add_documents(document)?; + let mut buf = String::new(); + let mut reader = BufReader::new(reader); + + while reader.read_line(&mut buf)? 
> 0 { + documents.extend_from_json(&mut buf.as_bytes())?; } documents.finish()?; @@ -1056,8 +1054,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - let json: serde_json::Value = serde_json::from_reader(reader)?; - documents.add_documents(json)?; + documents.extend_from_json(reader)?; documents.finish()?; Ok(writer.into_inner()) @@ -1065,16 +1062,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result> { fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; - - let iter = CSVDocumentDeserializer::from_reader(reader)?; - - for doc in iter { - let doc = doc?; - documents.add_documents(doc)?; - } - - documents.finish()?; + milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; Ok(writer.into_inner()) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index c2b0e01cc..6ba890b79 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,16 +1,14 @@ use std::collections::BTreeMap; use std::io; -use std::io::Cursor; -use std::io::Write; +use std::io::{Cursor, Write}; use byteorder::{BigEndian, WriteBytesExt}; use serde::Deserializer; use serde_json::Value; -use crate::FieldId; - use super::serde::DocumentVisitor; use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; +use crate::FieldId; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -27,7 +25,7 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error}; /// let json = r##"{"id": 1, "name": "foo"}"##; /// let mut writer = Cursor::new(Vec::new()); /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap(); -/// builder.extend_from_json(Cursor::new(json.as_bytes())).unwrap(); +/// builder.extend_from_json(&mut json.as_bytes()).unwrap(); /// builder.finish().unwrap(); /// ``` pub struct DocumentBatchBuilder { @@ -46,16 +44,14 @@ impl DocumentBatchBuilder { // add space to write the offset of the metadata at the end of the writer writer.write_u64::(0)?; - let this = Self { + Ok(Self { inner: writer, index, obkv_buffer: Vec::new(), value_buffer: Vec::new(), values: BTreeMap::new(), count: 0, - }; - - Ok(this) + }) } /// Returns the number of documents that have been written to the builder. @@ -117,27 +113,31 @@ impl DocumentBatchBuilder { for (i, record) in records.into_records().enumerate() { let record = record?; - let mut writer = obkv::KvWriter::new(Cursor::new(&mut this.obkv_buffer)); + this.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { let value = match ty { - AllowedType::Number => value.parse::().map(Value::from).map_err(|error| Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), - })?, + AllowedType::Number => { + value.parse::().map(Value::from).map_err(|error| { + Error::ParseFloat { + error, + // +1 for the header offset. + line: i + 1, + value: value.to_string(), + } + })? 
+ } AllowedType::String => Value::String(value.to_string()), }; + this.value_buffer.clear(); serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; writer.insert(*fid, &this.value_buffer)?; - this.value_buffer.clear(); } this.inner.write_u32::(this.obkv_buffer.len() as u32)?; this.inner.write_all(&this.obkv_buffer)?; - this.obkv_buffer.clear(); this.count += 1; } @@ -156,7 +156,8 @@ fn parse_csv_header(header: &str) -> (String, AllowedType) { match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. + "number" => (field_name.to_string(), AllowedType::Number), + // if the pattern isn't reconized, we keep the whole field. _otherwise => (header.to_string(), AllowedType::String), }, None => (header.to_string(), AllowedType::String), @@ -169,9 +170,8 @@ mod test { use serde_json::{json, Map}; - use crate::documents::DocumentBatchReader; - use super::*; + use crate::documents::DocumentBatchReader; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -525,7 +525,9 @@ mod test { "Boston","United States","4628910""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } #[test] @@ -534,7 +536,9 @@ mod test { "Boston","United States","4628910", "too much""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } #[test] @@ -543,6 +547,8 @@ mod test { "Boston","United States""#; let mut buf = Vec::new(); - assert!(DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()); + assert!( + DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err() + ); } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 6f653d825..14d97ee7d 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -7,9 +7,8 @@ mod builder; mod reader; mod serde; -use std::num::ParseFloatError; -use std::io; use std::fmt::{self, Debug}; +use std::io; use ::serde::{Deserialize, Serialize}; use bimap::BiHashMap; @@ -24,7 +23,7 @@ pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { /// Insert the field in the map, or return it's field id if it doesn't already exists. 
- pub fn insert(&mut self, field: &str) -> FieldId { + pub fn insert(&mut self, field: &str) -> FieldId { match self.0.get_by_right(field) { Some(field_id) => *field_id, None => { @@ -43,7 +42,7 @@ impl DocumentsBatchIndex { self.0.len() } - pub fn iter(&self) -> impl Iterator { + pub fn iter(&self) -> bimap::hash::Iter { self.0.iter() } @@ -83,11 +82,7 @@ impl io::Write for ByteCounter { #[derive(Debug)] pub enum Error { - ParseFloat { - error: std::num::ParseFloatError, - line: usize, - value: String, - }, + ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, Custom(String), JsonError(serde_json::Error), @@ -124,7 +119,9 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { - Error::ParseFloat { error, line, value} => write!(f, "Error parsing number {:?} at line {}: {}", value, line, error), + Error::ParseFloat { error, line, value } => { + write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) + } Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs index 4dfdca2c7..d57bf1ffb 100644 --- a/milli/src/documents/serde.rs +++ b/milli/src/documents/serde.rs @@ -1,18 +1,13 @@ use std::collections::BTreeMap; -use std::io::Cursor; -use std::io::Write; use std::fmt; +use std::io::{Cursor, Write}; use byteorder::WriteBytesExt; +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; use serde::Deserialize; -use serde::de::DeserializeSeed; -use serde::de::MapAccess; -use serde::de::SeqAccess; -use serde::de::Visitor; use serde_json::Value; -use super::Error; -use super::{ByteCounter, DocumentsBatchIndex}; +use super::{ByteCounter, DocumentsBatchIndex, Error}; use crate::FieldId; macro_rules! tri { @@ -31,8 +26,9 @@ impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_str(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_str(self) } } @@ -43,7 +39,7 @@ impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { where E: serde::de::Error, { - Ok(self.0.insert(v)) + Ok(self.0.insert(v)) } fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { @@ -58,8 +54,9 @@ impl<'de> DeserializeSeed<'de> for ValueDeserializer { fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - serde_json::Value::deserialize(deserializer) + D: serde::Deserializer<'de>, + { + serde_json::Value::deserialize(deserializer) } } @@ -80,7 +77,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: SeqAccess<'de>, { - while let Some(v) = seq.next_element_seed(&mut *self)? { tri!(v) } + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } Ok(Ok(())) } @@ -89,7 +88,9 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { where A: MapAccess<'de>, { - while let Some((key, value)) = map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? { + while let Some((key, value)) = + map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? 
+ { self.values.insert(key, value); } @@ -119,13 +120,15 @@ impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { } impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where W: Write, +where + W: Write, { type Value = Result<(), Error>; fn deserialize(self, deserializer: D) -> Result where - D: serde::Deserializer<'de> { - deserializer.deserialize_map(self) + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 17c778060..440546b10 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -906,8 +906,9 @@ mod tests { let mut cursor = Cursor::new(Vec::new()); + let big_object = serde_json::to_string(&big_object).unwrap(); let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.add_documents(big_object).unwrap(); + builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); builder.finish().unwrap(); cursor.set_position(0); let content = DocumentBatchReader::from_reader(cursor).unwrap(); diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index eac60effb..08aa72d35 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -544,7 +544,8 @@ mod test { mod primary_key_inference { use bimap::BiHashMap; - use crate::{documents::DocumentsBatchIndex, update::index_documents::transform::find_primary_key}; + use crate::documents::DocumentsBatchIndex; + use crate::update::index_documents::transform::find_primary_key; #[test] fn primary_key_infered_on_first_field() { From 9b8ab40d80c0fdf6cfeb95270ebd7bdbb4cf386a Mon Sep 17 00:00:00 2001 From: marin postma Date: Tue, 26 Oct 2021 11:35:49 +0200 Subject: [PATCH 12/22] remove search folder --- search/Cargo.toml | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 search/Cargo.toml diff --git a/search/Cargo.toml b/search/Cargo.toml deleted file mode 100644 index d9441984e..000000000 --- a/search/Cargo.toml +++ /dev/null @@ -1,18 +0,0 @@ -[package] -name = "search" -version = "0.19.0" -authors = ["Clément Renault "] -edition = "2018" - -[dependencies] -anyhow = "1.0.38" -byte-unit = { version = "4.0.9", default-features = false, features = ["std"] } -heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" } -log = "0.4.14" -milli = { path = "../milli" } -serde_json = "1.0.62" -stderrlog = "0.5.1" -structopt = { version = "0.3.21", default-features = false } - -[target.'cfg(target_os = "linux")'.dependencies] -jemallocator = "0.3.2" From 3599df77f05f4e89a28c0160411e95a840e0b227 Mon Sep 17 00:00:00 2001 From: many Date: Tue, 26 Oct 2021 17:49:35 +0200 Subject: [PATCH 13/22] Change some error messages --- milli/src/asc_desc.rs | 12 +- milli/src/criterion.rs | 14 ++- milli/src/error.rs | 112 ++++++++++++------ milli/src/search/facet/filter_condition.rs | 104 ++++++---------- milli/src/update/index_documents/transform.rs | 6 +- milli/src/update/settings.rs | 8 +- 6 files changed, 138 insertions(+), 118 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index 00f65a459..c0a277c0c 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -28,12 +28,12 @@ impl fmt::Display for AscDescError { write!(f, "Longitude must be contained between -180 and 180 degrees.",) } Self::InvalidSyntax { name } => { - write!(f, "invalid asc/desc syntax for {}.", name) + write!(f, "invalid 
asc/desc syntax for `{}`.", name) } Self::ReservedKeyword { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a asc/desc rule.", + "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.", name ) } @@ -192,18 +192,18 @@ impl fmt::Display for SortError { Self::BadGeoPointUsage { name } => { write!( f, - "invalid syntax for the `_geoPoint` parameter: `{}`. \ + "Invalid syntax for the `_geoPoint` parameter: `{}`. \ Usage: `_geoPoint(latitude, longitude):asc`.", name ) } Self::InvalidName { name } => { - write!(f, "invalid syntax for the sort parameter `{}`.", name) + write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name) } Self::ReservedName { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a sort expression.", + "`{}` is a reserved keyword and thus can't be used as a sort expression.", name ) } @@ -211,7 +211,7 @@ impl fmt::Display for SortError { write!( f, "`{}` is a reserved keyword and thus can't be used as a sort expression. \ - Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.", + Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.", name, ) } diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs index aff7fcf68..0586fcc0f 100644 --- a/milli/src/criterion.rs +++ b/milli/src/criterion.rs @@ -19,21 +19,25 @@ impl fmt::Display for CriterionError { match self { Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name), Self::ReservedName { name } => { - write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name) + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a ranking rule", + name + ) } Self::ReservedNameForSort { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for sorting at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. \ +`{}` can only be used for sorting at search time", name, name ) } Self::ReservedNameForFilter { name } => { write!( f, - "{} is a reserved keyword and thus can't be used as a ranking rule. \ -{} can only be used for filtering at search time", + "`{}` is a reserved keyword and thus can't be used as a ranking rule. 
\ +`{}` can only be used for filtering at search time", name, name ) } diff --git a/milli/src/error.rs b/milli/src/error.rs index 1f1cc5264..a4125d117 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -59,23 +59,28 @@ pub enum UserError { DocumentLimitReached, InvalidDocumentId { document_id: Value }, InvalidFacetsDistribution { invalid_facets_name: HashSet }, - InvalidFilter(pest::error::Error), - InvalidFilterAttribute(pest::error::Error), + InvalidFilter(FilterError), InvalidGeoField { document_id: Value, object: Value }, InvalidSortableAttribute { field: String, valid_fields: HashSet }, SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, - MissingDocumentId { document: Object }, + MissingDocumentId { primary_key: String, document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, - PrimaryKeyCannotBeChanged, - PrimaryKeyCannotBeReset, + PrimaryKeyCannotBeChanged(String), SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, } +#[derive(Debug)] +pub enum FilterError { + InvalidAttribute { field: String, valid_fields: HashSet }, + ReservedKeyword { field: String, context: Option }, + Syntax(pest::error::Error), +} + impl From for Error { fn from(error: io::Error) -> Error { // TODO must be improved and more precise @@ -160,6 +165,12 @@ impl From for Error { } } +impl From for Error { + fn from(error: FilterError) -> Error { + Error::UserError(UserError::InvalidFilter(error)) + } +} + impl From for Error { fn from(error: SerializationError) -> Error { Error::InternalError(InternalError::Serialization(error)) @@ -169,7 +180,7 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InternalError(error) => write!(f, "internal: {}", error), + Self::InternalError(error) => write!(f, "internal: {}.", error), Self::IoError(error) => error.fmt(f), Self::UserError(error) => error.fmt(f), } @@ -182,15 +193,15 @@ impl fmt::Display for InternalError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::DatabaseMissingEntry { db_name, key } => { - write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { - f.write_str("invalid compression type have been specified to grenad") + f.write_str("Invalid compression type have been specified to grenad.") } Self::IndexingMergingKeys { process } => { - write!(f, "invalid merge while processing {}", process) + write!(f, "Invalid merge while processing {}.", process) } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), @@ -208,67 +219,100 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), Self::CriterionError(error) => write!(f, "{}", error), - Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "); 
write!( f, - "invalid facet distribution, the fields {} are not set as filterable", + "Invalid facet distribution, the fields `{}` are not set as filterable.", name_list ) } Self::InvalidFilter(error) => error.fmt(f), Self::InvalidGeoField { document_id, object } => write!( f, - "the document with the id: {} contains an invalid _geo field: {}", + "The document with the id: `{}` contains an invalid _geo field: `{}`.", document_id, object ), Self::InvalidDocumentId { document_id } => { - let json = serde_json::to_string(document_id).unwrap(); + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; write!( f, - "document identifier is invalid {}, \ -a document id can be of type integer or string \ -only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", - json + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", + document_id ) } - Self::InvalidFilterAttribute(error) => error.fmt(f), Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); write!( f, - "Attribute {} is not sortable, available sortable attributes are: {}", + "Attribute `{}` is not sortable. Available sortable attributes are: `{}`.", field, valid_names ) } Self::SortRankingRuleMissing => f.write_str( - "You must specify where \"sort\" is listed in the \ -rankingRules setting to use the sort parameter at search time", + "The sort ranking rule must be specified in the \ +ranking rules settings to use the sort parameter at search time.", ), - Self::MissingDocumentId { document } => { + Self::MissingDocumentId { primary_key, document } => { let json = serde_json::to_string(document).unwrap(); - write!(f, "document doesn't have an identifier {}", json) + write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("missing primary key"), - Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"), + Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."), // TODO where can we find it instead of writing the text ourselves? 
- Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"), - Self::InvalidStoreFile => f.write_str("store file is not a valid database file"), - Self::PrimaryKeyCannotBeChanged => { - f.write_str("primary key cannot be changed if the database contains documents") - } - Self::PrimaryKeyCannotBeReset => { - f.write_str("primary key cannot be reset if the database contains documents") + Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."), + Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."), + Self::PrimaryKeyCannotBeChanged(primary_key) => { + write!(f, "Index already has a primary key: `{}`.", primary_key) } Self::SerdeJson(error) => error.fmt(f), Self::SortError(error) => write!(f, "{}", error), Self::UnknownInternalDocumentId { document_id } => { - write!(f, "an unknown internal document id have been used ({})", document_id) + write!(f, "An unknown internal document id have been used: `{}`.", document_id) + } + } + } +} + +impl fmt::Display for FilterError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + Self::InvalidAttribute { field, valid_fields } => write!( + f, + "Attribute `{}` is not filterable. Available filterable attributes are: `{}`.", + field, + valid_fields + .clone() + .into_iter() + .reduce(|left, right| left + "`, `" + &right) + .unwrap_or_default() + ), + Self::ReservedKeyword { field, context: Some(context) } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression. {}", + field, context + ) + } + Self::ReservedKeyword { field, context: None } => { + write!( + f, + "`{}` is a reserved keyword and thus can't be used as a filter expression.", + field + ) + } + Self::Syntax(syntax_helper) => { + write!(f, "Invalid syntax for the filter parameter: `{}`.", syntax_helper) } } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index f1055b2f8..3378054d4 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -6,7 +6,6 @@ use std::str::FromStr; use either::Either; use heed::types::DecodeIgnore; -use itertools::Itertools; use log::debug; use pest::error::{Error as PestError, ErrorVariant}; use pest::iterators::{Pair, Pairs}; @@ -17,7 +16,7 @@ use self::FilterCondition::*; use self::Operator::*; use super::parser::{FilterParser, Rule, PREC_CLIMBER}; use super::FacetNumberRange; -use crate::error::UserError; +use crate::error::FilterError; use crate::heed_codec::facet::{ FacetLevelValueF64Codec, FacetStringLevelZeroCodec, FacetStringLevelZeroValueCodec, }; @@ -117,8 +116,7 @@ impl FilterCondition { ) -> Result { let fields_ids_map = index.fields_ids_map(rtxn)?; let filterable_fields = index.filterable_fields(rtxn)?; - let lexed = - FilterParser::parse(Rule::prgm, expression).map_err(UserError::InvalidFilter)?; + let lexed = FilterParser::parse(Rule::prgm, expression).map_err(FilterError::Syntax)?; FilterCondition::from_pairs(&fields_ids_map, &filterable_fields, lexed) } @@ -169,15 +167,11 @@ impl FilterCondition { item: Pair, ) -> Result { if !filterable_fields.contains("_geo") { - return Err(UserError::InvalidFilterAttribute(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `_geo` is not filterable, available filterable attributes are: {}", - filterable_fields.iter().join(", "), - ), - }, - item.as_span(), - )))?; + return Err(FilterError::InvalidAttribute { + field: "_geo".to_string(), + valid_fields: 
filterable_fields.clone(), + } + .into()); } let mut items = item.into_inner(); let fid = match fields_ids_map.id("_geo") { @@ -194,27 +188,27 @@ impl FilterCondition { .map(|param| (param.clone(), param.as_span())) .map(|(param, span)| pest_parse(param).0.map(|arg| (arg, span))) .collect::, _>>() - .map_err(UserError::InvalidFilter)?; + .map_err(FilterError::Syntax)?; if parameters.len() != 3 { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"), }, // we want to point to the last parameters and if there was no parameters we // point to the parenthesis parameters.last().map(|param| param.1.clone()).unwrap_or(param_span), - )))?; + )).into()); } let (lat, lng, distance) = (¶meters[0], ¶meters[1], parameters[2].0); if !(-90.0..=90.0).contains(&lat.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Latitude must be contained between -90 and 90 degrees."), }, lat.1.clone(), )))?; } else if !(-180.0..=180.0).contains(&lng.0) { - return Err(UserError::InvalidFilter(PestError::new_from_span( + return Err(FilterError::Syntax(PestError::new_from_span( ErrorVariant::CustomError { message: format!("Longitude must be contained between -180 and 180 degrees."), }, @@ -230,9 +224,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -240,8 +232,8 @@ impl FilterCondition { let (lresult, _) = pest_parse(items.next().unwrap()); let (rresult, _) = pest_parse(items.next().unwrap()); - let lvalue = lresult.map_err(UserError::InvalidFilter)?; - let rvalue = rresult.map_err(UserError::InvalidFilter)?; + let lvalue = lresult.map_err(FilterError::Syntax)?; + let rvalue = rresult.map_err(FilterError::Syntax)?; Ok(Operator(fid, Between(lvalue, rvalue))) } @@ -252,9 +244,7 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; @@ -272,16 +262,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThan(value))) } @@ -292,16 +280,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? 
{ Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, GreaterThanOrEqual(value))) } @@ -312,16 +298,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThan(value))) } @@ -332,16 +316,14 @@ impl FilterCondition { item: Pair, ) -> Result { let mut items = item.into_inner(); - let fid = match field_id(fields_ids_map, filterable_fields, &mut items) - .map_err(UserError::InvalidFilterAttribute)? - { + let fid = match field_id(fields_ids_map, filterable_fields, &mut items)? { Some(fid) => fid, None => return Ok(Empty), }; let value = items.next().unwrap(); let (result, _svalue) = pest_parse(value); - let value = result.map_err(UserError::InvalidFilter)?; + let value = result.map_err(FilterError::Syntax)?; Ok(Operator(fid, LowerThanOrEqual(value))) } @@ -598,43 +580,27 @@ fn field_id( fields_ids_map: &FieldsIdsMap, filterable_fields: &HashSet, items: &mut Pairs, -) -> StdResult, PestError> { +) -> StdResult, FilterError> { // lexing ensures that we at least have a key let key = items.next().unwrap(); if key.as_rule() == Rule::reserved { - let message = match key.as_str() { + return match key.as_str() { key if key.starts_with("_geoPoint") => { - format!( - "`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. \ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - ) + Err(FilterError::ReservedKeyword { field: "_geoPoint".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key @ "_geo" => { - format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression. 
\ - Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates.", - key - ) + "_geo" => { + Err(FilterError::ReservedKeyword { field: "_geo".to_string(), context: Some("Use the _geoRadius(latitude, longitude, distance) built-in rule to filter on _geo field coordinates.".to_string()) }) } - key => format!( - "`{}` is a reserved keyword and thus can't be used as a filter expression.", - key - ), + key => + Err(FilterError::ReservedKeyword { field: key.to_string(), context: None }), }; - return Err(PestError::new_from_span(ErrorVariant::CustomError { message }, key.as_span())); } if !filterable_fields.contains(key.as_str()) { - return Err(PestError::new_from_span( - ErrorVariant::CustomError { - message: format!( - "attribute `{}` is not filterable, available filterable attributes are: {}.", - key.as_str(), - filterable_fields.iter().join(", "), - ), - }, - key.as_span(), - )); + return Err(FilterError::InvalidAttribute { + field: key.as_str().to_string(), + valid_fields: filterable_fields.clone(), + }); } Ok(fields_ids_map.id(key.as_str())) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 08aa72d35..855fb8db9 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -187,7 +187,11 @@ impl Transform<'_, '_> { } } - return Err(UserError::MissingDocumentId { document: json }.into()); + return Err(UserError::MissingDocumentId { + primary_key: primary_key_name, + document: json, + } + .into()); } let uuid = diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index dee63c726..94875a079 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.put_primary_key(self.wtxn, primary_key)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeChanged.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::Reset => { @@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> { self.index.delete_primary_key(self.wtxn)?; Ok(()) } else { - Err(UserError::PrimaryKeyCannotBeReset.into()) + let primary_key = self.index.primary_key(self.wtxn)?.unwrap(); + Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into()) } } Setting::NotSet => Ok(()), @@ -1105,7 +1107,7 @@ mod tests { builder.reset_primary_key(); let err = builder.execute(|_, _| ()).unwrap_err(); - assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset))); + assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_)))); wtxn.abort().unwrap(); // But if we clear the database... 
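A minimal sketch of what the `FilterError` introduced in this patch looks like to callers once it renders through its new `Display` implementation. The import path and the `main` wrapper are assumptions for illustration only; the variant shapes and the message format come from the diff above:

    use std::collections::HashSet;

    // Assumed import path; in the tree the type is defined in milli/src/error.rs.
    use milli::error::FilterError;

    fn main() {
        let error = FilterError::InvalidAttribute {
            field: "title".to_string(),
            valid_fields: HashSet::from(["price".to_string(), "_geo".to_string()]),
        };

        // Prints: Attribute `title` is not filterable. Available filterable
        // attributes are: `price`, `_geo`.
        // `HashSet` iteration order is unspecified, so the listed fields can
        // come out in either order; a later patch in this series switches
        // `valid_fields` to a `BTreeSet`, which makes the message deterministic.
        println!("{}", error);
    }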
From 2be755ce75d8b1e64c11bf1801a1f3474beb3362 Mon Sep 17 00:00:00 2001 From: many Date: Wed, 27 Oct 2021 19:50:41 +0200 Subject: [PATCH 14/22] Lower error check, already check in meilisearch --- milli/src/asc_desc.rs | 46 ---------------------- milli/src/search/facet/filter_condition.rs | 43 ++------------------ 2 files changed, 4 insertions(+), 85 deletions(-) diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs index c0a277c0c..8d4973c2f 100644 --- a/milli/src/asc_desc.rs +++ b/milli/src/asc_desc.rs @@ -318,50 +318,4 @@ mod tests { ); } } - - #[test] - fn sort_error_message() { - let errors = [ - ( - AscDescError::InvalidSyntax { name: S("truc:machin") }, - S("invalid syntax for the sort parameter `truc:machin`."), - ), - ( - AscDescError::InvalidSyntax { name: S("hello:world") }, - S("invalid syntax for the sort parameter `hello:world`."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geo") }, - S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::ReservedKeyword { name: S("_geoDistance") }, - S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.") - ), - ( - AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") }, - S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."), - ), - ( - AscDescError::InvalidLatitude, - S("Latitude must be contained between -90 and 90 degrees."), - ), - ( - AscDescError::InvalidLongitude, - S("Longitude must be contained between -180 and 180 degrees."), - ), - ]; - - for (asc_desc_error, expected_message) in errors { - let sort_error = SortError::from(asc_desc_error); - assert_eq!( - sort_error.to_string(), - expected_message, - "was expecting {} for the error {:?} but instead got {}", - expected_message, - sort_error, - sort_error.to_string() - ); - } - } } diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs index 3378054d4..f8d40aefb 100644 --- a/milli/src/search/facet/filter_condition.rs +++ b/milli/src/search/facet/filter_condition.rs @@ -755,39 +755,13 @@ mod tests { let index = Index::new(options, &path).unwrap(); let rtxn = index.read_txn().unwrap(); - let error = FilterCondition::from_str(&rtxn, &index, "_geo = 12").unwrap_err(); - assert!(error - .to_string() - .contains("`_geo` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, "_geo = 12").is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoDistance` is a reserved keyword and thus can't be used as a filter expression."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoDistance <= 1000"#).is_err()); - let error = FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. 
Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint > 5"#).is_err()); - let error = - FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).unwrap_err(); - assert!(error - .to_string() - .contains("`_geoPoint` is a reserved keyword and thus can't be used as a filter expression. Use the `_geoRadius(latitude, longitude, distance)` built-in rule to filter on `_geo` field coordinates."), - "{}", - error.to_string() - ); + assert!(FilterCondition::from_str(&rtxn, &index, r#"_geoPoint(12, 16) > 5"#).is_err()); } #[test] @@ -804,15 +778,6 @@ mod tests { builder.execute(|_, _| ()).unwrap(); wtxn.commit().unwrap(); - let rtxn = index.read_txn().unwrap(); - // _geo is not filterable - let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(12, 12, 10)"); - assert!(result.is_err()); - let error = result.unwrap_err(); - assert!(error - .to_string() - .contains("attribute `_geo` is not filterable, available filterable attributes are:"),); - let mut wtxn = index.write_txn().unwrap(); let mut builder = Settings::new(&mut wtxn, &index, 0); builder.set_filterable_fields(hashset! { S("_geo"), S("price") }); From 183d3dada7d50c2bc8a129805e8dfae4bb3255dd Mon Sep 17 00:00:00 2001 From: marin postma Date: Thu, 28 Oct 2021 10:33:04 +0200 Subject: [PATCH 15/22] return document count from builder --- milli/src/documents/builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 6ba890b79..f95fa9190 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -62,7 +62,7 @@ impl DocumentBatchBuilder { /// This method must be called after the document addition is terminated. It will put the /// metadata at the end of the file, and write the metadata offset at the beginning on the /// file. - pub fn finish(self) -> Result<(), Error> { + pub fn finish(self) -> Result { let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self; let meta = DocumentsMetadata { count, index }; @@ -74,7 +74,7 @@ impl DocumentBatchBuilder { writer.flush()?; - Ok(()) + Ok(count) } /// Extends the builder with json documents from a reader. 
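With this change `finish` reports the number of documents written instead of `()`. A short usage sketch, mirroring the builder doc-test from earlier in the series; the function name and the `?`-based error plumbing are illustrative:

    use std::io::Cursor;

    use milli::documents::{DocumentBatchBuilder, Error};

    fn write_batch() -> Result<usize, Error> {
        let json = r#"{"id": 1, "name": "foo"}"#;
        let mut writer = Cursor::new(Vec::new());

        let mut builder = DocumentBatchBuilder::new(&mut writer)?;
        builder.extend_from_json(&mut json.as_bytes())?;

        // `finish` now returns the document count, so callers can report it
        // without keeping a separate tally: `len()` was previously the only
        // way to get it, and it is unreachable once the builder is consumed.
        let count = builder.finish()?;
        assert_eq!(count, 1);
        Ok(count)
    }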
From ed6db196810f78632758fc386f8a7f5f6cd6f357 Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 28 Oct 2021 11:18:32 +0200
Subject: [PATCH 16/22] Fix PR comments

---
 milli/src/error.rs                         |  8 ++++----
 milli/src/search/facet/filter_condition.rs | 22 +++++++++++++++-------
 milli/src/search/mod.rs                    |  4 ++--
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index a4125d117..be8458ce6 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -1,4 +1,4 @@
-use std::collections::HashSet;
+use std::collections::BTreeSet;
 use std::convert::Infallible;
 use std::error::Error as StdError;
 use std::{fmt, io, str};
@@ -58,10 +58,10 @@ pub enum UserError {
     CriterionError(CriterionError),
     DocumentLimitReached,
     InvalidDocumentId { document_id: Value },
-    InvalidFacetsDistribution { invalid_facets_name: HashSet<String> },
+    InvalidFacetsDistribution { invalid_facets_name: BTreeSet<String> },
     InvalidFilter(FilterError),
     InvalidGeoField { document_id: Value, object: Value },
-    InvalidSortableAttribute { field: String, valid_fields: HashSet<String> },
+    InvalidSortableAttribute { field: String, valid_fields: BTreeSet<String> },
     SortRankingRuleMissing,
     InvalidStoreFile,
     MaxDatabaseSizeReached,
@@ -76,7 +76,7 @@ pub enum UserError {
 
 #[derive(Debug)]
 pub enum FilterError {
-    InvalidAttribute { field: String, valid_fields: HashSet<String> },
+    InvalidAttribute { field: String, valid_fields: BTreeSet<String> },
     ReservedKeyword { field: String, context: Option<String> },
     Syntax(pest::error::Error<Rule>),
 }
diff --git a/milli/src/search/facet/filter_condition.rs b/milli/src/search/facet/filter_condition.rs
index f8d40aefb..cd7bcdc4f 100644
--- a/milli/src/search/facet/filter_condition.rs
+++ b/milli/src/search/facet/filter_condition.rs
@@ -169,7 +169,7 @@ impl FilterCondition {
             if !filterable_fields.contains("_geo") {
                 return Err(FilterError::InvalidAttribute {
                     field: "_geo".to_string(),
-                    valid_fields: filterable_fields.clone(),
+                    valid_fields: filterable_fields.into_iter().cloned().collect(),
                 }
                 .into());
             }
@@ -192,7 +192,7 @@ impl FilterCondition {
             if parameters.len() != 3 {
                 return Err(FilterError::Syntax(PestError::new_from_span(
                     ErrorVariant::CustomError {
-                        message: format!("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"),
+                        message: format!("The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"),
                     },
                     // we want to point to the last parameters and if there was no parameters we
                     // point to the parenthesis
@@ -599,7 +599,7 @@ fn field_id(
     if !filterable_fields.contains(key.as_str()) {
         return Err(FilterError::InvalidAttribute {
             field: key.as_str().to_string(),
-            valid_fields: filterable_fields.clone(),
+            valid_fields: filterable_fields.into_iter().cloned().collect(),
         });
     }
 
@@ -829,26 +829,34 @@ mod tests {
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius");
         assert!(result.is_err());
        let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius don't have any parameters
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius()");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius don't have enough parameters
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2)");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius have too many parameters
         let result =
             FilterCondition::from_str(&rtxn, &index, "_geoRadius(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)");
         assert!(result.is_err());
         let error = result.unwrap_err();
-        assert!(error.to_string().contains("The `_geoRadius` filter expect three arguments: `_geoRadius(latitude, longitude, radius)`"));
+        assert!(error.to_string().contains(
+            "The _geoRadius filter expect three arguments: _geoRadius(latitude, longitude, radius)"
+        ));
 
         // georadius have a bad latitude
         let result = FilterCondition::from_str(&rtxn, &index, "_geoRadius(-100, 150, 10)");
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index bec059d46..aa2544091 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -151,13 +151,13 @@ impl<'a> Search<'a> {
             Member::Field(ref field) if !sortable_fields.contains(field) => {
                 return Err(UserError::InvalidSortableAttribute {
                     field: field.to_string(),
-                    valid_fields: sortable_fields,
+                    valid_fields: sortable_fields.into_iter().collect(),
                 })?
             }
             Member::Geo(_) if !sortable_fields.contains("_geo") => {
                 return Err(UserError::InvalidSortableAttribute {
                     field: "_geo".to_string(),
-                    valid_fields: sortable_fields,
+                    valid_fields: sortable_fields.into_iter().collect(),
                 })?
             }
             _ => (),

From 9f1e0d2a49447f106277b8a07e0bba65370b47c8 Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 28 Oct 2021 14:47:17 +0200
Subject: [PATCH 17/22] Refine asc/desc error messages

---
 milli/src/asc_desc.rs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs
index 8d4973c2f..f07e1ded8 100644
--- a/milli/src/asc_desc.rs
+++ b/milli/src/asc_desc.rs
@@ -28,7 +28,7 @@ impl fmt::Display for AscDescError {
                 write!(f, "Longitude must be contained between -180 and 180 degrees.",)
             }
             Self::InvalidSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for `{}`.", name)
+                write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedKeyword { name } => {
                 write!(
                     f,
@@ -192,8 +192,8 @@ impl fmt::Display for SortError {
             Self::BadGeoPointUsage { name } => {
                 write!(
                     f,
-                    "Invalid syntax for the `_geoPoint` parameter: `{}`. \
-                    Usage: `_geoPoint(latitude, longitude):asc`.",
+                    "Invalid syntax for the geo parameter: expected expression formated like \
+                    `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
                     name
                 )
             }

From 056ff13c4d981a37fce66b6f1c7eee6a375920d0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9mentine=20Urquizar?=
Date: Thu, 28 Oct 2021 14:52:57 +0200
Subject: [PATCH 18/22] Update version for the next release (v0.20.0)

---
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 341b8eb7c..2e71dfb21 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 310388e01..824771ce7 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 726fa9c5f..461c22de0 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 209c8b1f7..375cbf0dc 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.19.0"
+version = "0.20.0"
 authors = ["Kerollmops "]
 edition = "2018"

From 0c0038488c4881c4b5dd8d1c8e8ba59ea7d5737c Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 3 Nov 2021 11:24:06 +0100
Subject: [PATCH 19/22] Change last error messages

---
 milli/src/error.rs | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/milli/src/error.rs b/milli/src/error.rs
index be8458ce6..59744a32e 100644
--- a/milli/src/error.rs
+++ b/milli/src/error.rs
@@ -232,11 +232,21 @@ impl fmt::Display for UserError {
                 )
             }
             Self::InvalidFilter(error) => error.fmt(f),
-            Self::InvalidGeoField { document_id, object } => write!(
-                f,
-                "The document with the id: `{}` contains an invalid _geo field: `{}`.",
-                document_id, object
-            ),
+            Self::InvalidGeoField { document_id, object } => {
+                let document_id = match document_id {
+                    Value::String(id) => id.clone(),
+                    _ => document_id.to_string(),
+                };
+                let object = match object {
+                    Value::String(id) => id.clone(),
+                    _ => object.to_string(),
+                };
+                write!(
+                    f,
+                    "The document with the id: `{}` contains an invalid _geo field: `{}`.",
+                    document_id, object
+                )
+            },
             Self::InvalidDocumentId { document_id } => {
                 let document_id = match document_id {
                     Value::String(id) => id.clone(),
@@ -268,10 +278,9 @@ ranking rules settings to use the sort parameter at search time.",
                 write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
             }
             Self::MissingPrimaryKey => f.write_str("Missing primary key."),
-            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size reached."),
-            // TODO where can we find it instead of writing the text ourselves?
-            Self::NoSpaceLeftOnDevice => f.write_str("No space left on device."),
-            Self::InvalidStoreFile => f.write_str("Store file is not a valid database file."),
+            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
+            Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
+            Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
             Self::PrimaryKeyCannotBeChanged(primary_key) => {
                 write!(f, "Index already has a primary key: `{}`.", primary_key)
             }

From 702589104d45e5c1f51eac2ecaa269727ebf34a8 Mon Sep 17 00:00:00 2001
From: many
Date: Wed, 3 Nov 2021 14:20:01 +0100
Subject: [PATCH 20/22] Update version for the next release (v0.20.1)

---
 cli/Cargo.toml     | 2 +-
 helpers/Cargo.toml | 2 +-
 http-ui/Cargo.toml | 2 +-
 infos/Cargo.toml   | 2 +-
 milli/Cargo.toml   | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 24fb214b9..44f7d110b 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.1.0"
+version = "0.20.1"
 edition = "2018"
 description = "A CLI to interact with a milli index"

diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 2e71dfb21..da26a8baa 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 824771ce7..43980009a 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index 461c22de0..c96ff094c 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Clément Renault "]
 edition = "2018"

diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 375cbf0dc..fc45b5355 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.20.0"
+version = "0.20.1"
 authors = ["Kerollmops "]
 edition = "2018"

From 7b3bac46a0cc6e58e1ee5456f6aa1550cdf2852a Mon Sep 17 00:00:00 2001
From: many
Date: Thu, 4 Nov 2021 13:19:32 +0100
Subject: [PATCH 21/22] Change Attribute and Ranking rules errors

---
 milli/src/criterion.rs | 2 +-
 milli/src/error.rs     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index 0586fcc0f..aca2f95b5 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -17,7 +17,7 @@ pub enum CriterionError {
 impl fmt::Display for CriterionError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. 
Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name), Self::ReservedName { name } => { write!( f, diff --git a/milli/src/error.rs b/milli/src/error.rs index 59744a32e..9e8ad515d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -219,7 +219,7 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::AttributeLimitReached => f.write_str("Maximum number of attributes reached."), + Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), Self::CriterionError(error) => write!(f, "{}", error), Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { @@ -277,7 +277,7 @@ ranking rules settings to use the sort parameter at search time.", let json = serde_json::to_string(document).unwrap(); write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json) } - Self::MissingPrimaryKey => f.write_str("Missing primary key."), + Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index."), Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."), Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."), Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."), From 743ed9f57f04decf5fb6b0d43b0a5a609cbebf3c Mon Sep 17 00:00:00 2001 From: many Date: Thu, 4 Nov 2021 14:04:21 +0100 Subject: [PATCH 22/22] Bump milli version --- cli/Cargo.toml | 2 +- helpers/Cargo.toml | 2 +- http-ui/Cargo.toml | 2 +- infos/Cargo.toml | 2 +- milli/Cargo.toml | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 44f7d110b..eb03842ca 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cli" -version = "0.20.1" +version = "0.20.2" edition = "2018" description = "A CLI to interact with a milli index" diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml index da26a8baa..5b33d2a4f 100644 --- a/helpers/Cargo.toml +++ b/helpers/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "helpers" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml index 43980009a..04e1c708a 100644 --- a/http-ui/Cargo.toml +++ b/http-ui/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "http-ui" description = "The HTTP user interface of the milli search engine" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/infos/Cargo.toml b/infos/Cargo.toml index c96ff094c..645bc4cdd 100644 --- a/infos/Cargo.toml +++ b/infos/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "infos" -version = "0.20.1" +version = "0.20.2" authors = ["Clément Renault "] edition = "2018" diff --git a/milli/Cargo.toml b/milli/Cargo.toml index fc45b5355..5aa04e569 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "milli" -version = "0.20.1" +version = "0.20.2" authors = ["Kerollmops "] edition = "2018"
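To close the series, a short sketch of the message produced by the reworked `CriterionError::InvalidName` variant from patch 21. It assumes the error type is re-exported at the crate root (e.g. `milli::CriterionError`), and `release_date` is a made-up rule name used only for illustration:

    fn main() {
        // An unknown ranking rule name now yields the descriptive message added in patch 21.
        let error = milli::CriterionError::InvalidName { name: "release_date".to_string() };
        assert_eq!(
            error.to_string(),
            "`release_date` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules."
        );
    }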