diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs
index 00bd4e72a..655275967 100644
--- a/benchmarks/benches/utils.rs
+++ b/benchmarks/benches/utils.rs
@@ -1,7 +1,7 @@
 #![allow(dead_code)]
 
 use std::fs::{create_dir_all, remove_dir_all, File};
-use std::io::{self, Cursor, Read, Seek};
+use std::io::{self, BufRead, BufReader, Cursor, Read, Seek};
 use std::num::ParseFloatError;
 use std::path::Path;
 
@@ -146,44 +146,34 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader<impl BufRead + Seek> {
-fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_jsonl(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
     Ok(writer.into_inner())
 }
 
-fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
 }
 
-fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
+fn documents_from_csv(reader: impl Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
diff --git a/cli/Cargo.toml b/cli/Cargo.toml
index 24fb214b9..eb03842ca 100644
--- a/cli/Cargo.toml
+++ b/cli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cli"
-version = "0.1.0"
+version = "0.20.2"
 edition = "2018"
 description = "A CLI to interact with a milli index"
 
diff --git a/cli/src/main.rs b/cli/src/main.rs
index cae4d081f..5e11dc3fb 100644
--- a/cli/src/main.rs
+++ b/cli/src/main.rs
@@ -1,5 +1,5 @@
 use std::fs::File;
-use std::io::{stdin, Cursor, Read};
+use std::io::{stdin, BufRead, BufReader, Cursor, Read};
 use std::path::PathBuf;
 use std::str::FromStr;
 
@@ -9,7 +9,6 @@ use indicatif::{MultiProgress, ProgressBar, ProgressStyle};
 use milli::update::UpdateIndexingStep::{
     ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition,
 };
-use serde_json::{Map, Value};
 use structopt::StructOpt;
 
 #[cfg(target_os = "linux")]
@@ -202,11 +201,11 @@ fn documents_from_jsonl(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<Map<String, Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
@@ -217,8 +216,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
@@ -226,17 +224,7 @@ fn documents_from_json(reader: impl Read) -> Result<Vec<u8>> {
 
 fn documents_from_csv(reader: impl Read) -> Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let mut records = csv::Reader::from_reader(reader);
-    let iter = records.deserialize::<Map<String, Value>>();
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
diff --git a/helpers/Cargo.toml b/helpers/Cargo.toml
index 2f3c26a7b..5b33d2a4f 100644
--- a/helpers/Cargo.toml
+++ b/helpers/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "helpers"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
diff --git a/http-ui/Cargo.toml b/http-ui/Cargo.toml
index 83d5f67c4..04e1c708a 100644
--- a/http-ui/Cargo.toml
+++ b/http-ui/Cargo.toml
@@ -1,7 +1,7 @@
 [package]
 name = "http-ui"
 description = "The HTTP user interface of the milli search engine"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
diff --git a/http-ui/src/documents_from_csv.rs b/http-ui/src/documents_from_csv.rs
deleted file mode 100644
index 2b62f23c2..000000000
--- a/http-ui/src/documents_from_csv.rs
+++ /dev/null
@@ -1,285 +0,0 @@
-use std::io::{Read, Result as IoResult};
-use std::num::ParseFloatError;
-
-use serde_json::{Map, Value};
-
-enum AllowedType {
-    String,
-    Number,
-}
-
-fn parse_csv_header(header: &str) -> (String, AllowedType) {
-    // if there are several separators we only split on the last one.
-    match header.rsplit_once(':') {
-        Some((field_name, field_type)) => match field_type {
-            "string" => (field_name.to_string(), AllowedType::String),
-            "number" => (field_name.to_string(), AllowedType::Number),
-            // we may return an error in this case.
-            _otherwise => (header.to_string(), AllowedType::String),
-        },
-        None => (header.to_string(), AllowedType::String),
-    }
-}
-
-pub struct CSVDocumentDeserializer<R>
-where
-    R: Read,
-{
-    documents: csv::StringRecordsIntoIter<R>,
-    headers: Vec<(String, AllowedType)>,
-}
-
-impl<R: Read> CSVDocumentDeserializer<R> {
-    pub fn from_reader(reader: R) -> IoResult<Self> {
-        let mut records = csv::Reader::from_reader(reader);
-
-        let headers = records.headers()?.into_iter().map(parse_csv_header).collect();
-
-        Ok(Self { documents: records.into_records(), headers })
-    }
-}
-
-impl<R: Read> Iterator for CSVDocumentDeserializer<R> {
-    type Item = anyhow::Result<Map<String, Value>>;
-
-    fn next(&mut self) -> Option<Self::Item> {
-        let csv_document = self.documents.next()?;
-
-        match csv_document {
-            Ok(csv_document) => {
-                let mut document = Map::new();
-
-                for ((field_name, field_type), value) in
-                    self.headers.iter().zip(csv_document.into_iter())
-                {
-                    let parsed_value: Result<Value, ParseFloatError> = match field_type {
-                        AllowedType::Number => {
-                            value.parse::<f64>().map(Value::from).map_err(Into::into)
-                        }
-                        AllowedType::String => Ok(Value::String(value.to_string())),
-                    };
-
-                    match parsed_value {
-                        Ok(value) => drop(document.insert(field_name.to_string(), value)),
-                        Err(_e) => {
-                            return Some(Err(anyhow::anyhow!(
-                                "Value '{}' is not a valid number",
-                                value
-                            )))
-                        }
-                    }
-                }
-
-                Some(Ok(document))
-            }
-            Err(e) => Some(Err(anyhow::anyhow!("Error parsing csv document: {}", e))),
-        }
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use serde_json::json;
-
-    use super::*;
-
-    #[test]
-    fn simple_csv_document() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn coma_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United, States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United, States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn quote_in_field() {
-        let documents = r#"city,country,pop
-"Boston","United"" States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United\" States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn integer_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.0,
-            })
-        );
-    }
-
-    #[test]
-    fn float_in_field() {
-        let documents = r#"city,country,pop:number
-"Boston","United States","4628910.01""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city": "Boston",
-                "country": "United States",
-                "pop": 4628910.01,
-            })
-        );
-    }
-
-    #[test]
-    fn several_double_dot_in_header() {
-        let documents = r#"city:love:string,country:state,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:love": "Boston",
-                "country:state": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn ending_by_double_dot_in_header() {
-        let documents = r#"city:,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header() {
-        let documents = r#":city,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                ":city": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn starting_by_double_dot_in_header2() {
-        let documents = r#":string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn double_double_dot_in_header() {
-        let documents = r#"city::string,country,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert_eq!(
-            Value::Object(csv_iter.next().unwrap().unwrap()),
-            json!({
-                "city:": "Boston",
-                "country": "United States",
-                "pop": "4628910",
-            })
-        );
-    }
-
-    #[test]
-    fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
-"Boston","United States","4628910""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count1() {
-        let documents = r#"city,country,pop
-"Boston","United States","4628910", "too much""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-
-    #[test]
-    fn bad_column_count2() {
-        let documents = r#"city,country,pop
-"Boston","United States""#;
-
-        let mut csv_iter = CSVDocumentDeserializer::from_reader(documents.as_bytes()).unwrap();
-
-        assert!(csv_iter.next().unwrap().is_err());
-    }
-}
diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs
index e84c94e50..8efd8ed69 100644
--- a/http-ui/src/main.rs
+++ b/http-ui/src/main.rs
@@ -1,10 +1,9 @@
-mod documents_from_csv;
 mod update_store;
 
 use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet};
 use std::fmt::Display;
 use std::fs::{create_dir_all, File};
-use std::io::Cursor;
+use std::io::{BufRead, BufReader, Cursor};
 use std::net::SocketAddr;
 use std::num::{NonZeroU32, NonZeroUsize};
 use std::path::PathBuf;
@@ -40,7 +39,6 @@ use warp::http::Response;
 use warp::Filter;
 
 use self::update_store::UpdateStore;
-use crate::documents_from_csv::CSVDocumentDeserializer;
 
 #[cfg(target_os = "linux")]
 #[global_allocator]
@@ -1048,11 +1046,11 @@ fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let values = serde_json::Deserializer::from_reader(reader)
-        .into_iter::<serde_json::Map<String, serde_json::Value>>();
-    for document in values {
-        let document = document?;
-        documents.add_documents(document)?;
+    let mut buf = String::new();
+    let mut reader = BufReader::new(reader);
+
+    while reader.read_line(&mut buf)? > 0 {
+        documents.extend_from_json(&mut buf.as_bytes())?;
     }
     documents.finish()?;
 
@@ -1063,8 +1061,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
     let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
 
-    let json: serde_json::Value = serde_json::from_reader(reader)?;
-    documents.add_documents(json)?;
+    documents.extend_from_json(reader)?;
     documents.finish()?;
 
     Ok(writer.into_inner())
@@ -1072,16 +1069,7 @@ fn documents_from_json(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
 
 fn documents_from_csv(reader: impl io::Read) -> anyhow::Result<Vec<u8>> {
     let mut writer = Cursor::new(Vec::new());
-    let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?;
-
-    let iter = CSVDocumentDeserializer::from_reader(reader)?;
-
-    for doc in iter {
-        let doc = doc?;
-        documents.add_documents(doc)?;
-    }
-
-    documents.finish()?;
+    milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?;
 
     Ok(writer.into_inner())
 }
diff --git a/infos/Cargo.toml b/infos/Cargo.toml
index b0220993f..645bc4cdd 100644
--- a/infos/Cargo.toml
+++ b/infos/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "infos"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Clément Renault <clement@meilisearch.com>"]
 edition = "2018"
 
diff --git a/milli/Cargo.toml b/milli/Cargo.toml
index 6913178b0..36e63916c 100644
--- a/milli/Cargo.toml
+++ b/milli/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "milli"
-version = "0.18.0"
+version = "0.20.2"
 authors = ["Kerollmops <clement@meilisearch.com>"]
 edition = "2018"
 
@@ -46,6 +46,7 @@ itertools = "0.10.0"
 # logging
 log = "0.4.14"
 logging_timer = "1.0.0"
+csv = "1.1.6"
 
 [dev-dependencies]
 big_s = "1.0.2"
diff --git a/milli/src/asc_desc.rs b/milli/src/asc_desc.rs
index 00f65a459..f07e1ded8 100644
--- a/milli/src/asc_desc.rs
+++ b/milli/src/asc_desc.rs
@@ -28,12 +28,12 @@ impl fmt::Display for AscDescError {
                 write!(f, "Longitude must be contained between -180 and 180 degrees.",)
             }
             Self::InvalidSyntax { name } => {
-                write!(f, "invalid asc/desc syntax for {}.", name)
+                write!(f, "Invalid syntax for the asc/desc parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedKeyword { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a asc/desc rule.",
+                    "`{}` is a reserved keyword and thus can't be used as a asc/desc rule.",
                     name
                 )
             }
@@ -192,18 +192,18 @@ impl fmt::Display for SortError {
             Self::BadGeoPointUsage { name } => {
                 write!(
                     f,
-                    "invalid syntax for the `_geoPoint` parameter: `{}`. \
-                    Usage: `_geoPoint(latitude, longitude):asc`.",
+                    "Invalid syntax for the geo parameter: expected expression formated like \
+                    `_geoPoint(latitude, longitude)` and ending by `:asc` or `:desc`, found `{}`.",
                     name
                 )
             }
             Self::InvalidName { name } => {
-                write!(f, "invalid syntax for the sort parameter `{}`.", name)
+                write!(f, "Invalid syntax for the sort parameter: expected expression ending by `:asc` or `:desc`, found `{}`.", name)
             }
             Self::ReservedName { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a sort expression.",
+                    "`{}` is a reserved keyword and thus can't be used as a sort expression.",
                     name
                 )
             }
@@ -211,7 +211,7 @@ impl fmt::Display for SortError {
                 write!(
                     f,
                     "`{}` is a reserved keyword and thus can't be used as a sort expression. \
-                    Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates.",
+                    Use the _geoPoint(latitude, longitude) built-in rule to sort on _geo field coordinates.",
                     name,
                 )
             }
@@ -318,50 +318,4 @@ mod tests {
             );
         }
     }
-
-    #[test]
-    fn sort_error_message() {
-        let errors = [
-            (
-                AscDescError::InvalidSyntax { name: S("truc:machin") },
-                S("invalid syntax for the sort parameter `truc:machin`."),
-            ),
-            (
-                AscDescError::InvalidSyntax { name: S("hello:world") },
-                S("invalid syntax for the sort parameter `hello:world`."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geo") },
-                S("`_geo` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoDistance") },
-                S("_geoDistance is a reserved keyword and thus can't be used as a sort expression.")
-            ),
-            (
-                AscDescError::ReservedKeyword { name: S("_geoRadius(12, 13)") },
-                S("`_geoRadius` is a reserved keyword and thus can't be used as a sort expression. Use the `_geoPoint(latitude, longitude)` built-in rule to sort on `_geo` field coordinates."),
-            ),
-            (
-                AscDescError::InvalidLatitude,
-                S("Latitude must be contained between -90 and 90 degrees."),
-            ),
-            (
-                AscDescError::InvalidLongitude,
-                S("Longitude must be contained between -180 and 180 degrees."),
-            ),
-        ];
-
-        for (asc_desc_error, expected_message) in errors {
-            let sort_error = SortError::from(asc_desc_error);
-            assert_eq!(
-                sort_error.to_string(),
-                expected_message,
-                "was expecting {} for the error {:?} but instead got {}",
-                expected_message,
-                sort_error,
-                sort_error.to_string()
-            );
-        }
-    }
 }
diff --git a/milli/src/criterion.rs b/milli/src/criterion.rs
index aff7fcf68..aca2f95b5 100644
--- a/milli/src/criterion.rs
+++ b/milli/src/criterion.rs
@@ -17,23 +17,27 @@ pub enum CriterionError {
 impl fmt::Display for CriterionError {
     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
         match self {
-            Self::InvalidName { name } => write!(f, "invalid ranking rule {}", name),
+            Self::InvalidName { name } => write!(f, "`{}` ranking rule is invalid. Valid ranking rules are Words, Typo, Sort, Proximity, Attribute, Exactness and custom ranking rules.", name),
             Self::ReservedName { name } => {
-                write!(f, "{} is a reserved keyword and thus can't be used as a ranking rule", name)
+                write!(
+                    f,
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule",
+                    name
+                )
             }
             Self::ReservedNameForSort { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-{} can only be used for sorting at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+`{}` can only be used for sorting at search time",
                     name, name
                 )
             }
             Self::ReservedNameForFilter { name } => {
                 write!(
                     f,
-                    "{} is a reserved keyword and thus can't be used as a ranking rule. \
-{} can only be used for filtering at search time",
+                    "`{}` is a reserved keyword and thus can't be used as a ranking rule. \
+`{}` can only be used for filtering at search time",
                     name, name
                 )
             }
diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs
index ba1319eff..f95fa9190 100644
--- a/milli/src/documents/builder.rs
+++ b/milli/src/documents/builder.rs
@@ -1,16 +1,20 @@
+use std::collections::BTreeMap;
 use std::io;
+use std::io::{Cursor, Write};
 
 use byteorder::{BigEndian, WriteBytesExt};
-use serde::ser::Serialize;
+use serde::Deserializer;
+use serde_json::Value;
 
-use super::serde::DocumentSerializer;
+use super::serde::DocumentVisitor;
 use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
+use crate::FieldId;
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
 /// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate other the documents.
+/// iterate over the documents.
 ///
 /// ## example:
 /// ```
@@ -18,43 +22,48 @@ use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
 /// use serde_json::json;
 /// use std::io::Cursor;
 ///
+/// let json = r##"{"id": 1, "name": "foo"}"##;
 /// let mut writer = Cursor::new(Vec::new());
 /// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.add_documents(json!({"id": 1, "name": "foo"})).unwrap();
+/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
 /// builder.finish().unwrap();
 /// ```
 pub struct DocumentBatchBuilder<W> {
-    serializer: DocumentSerializer<W>,
+    inner: ByteCounter<W>,
+    index: DocumentsBatchIndex,
+    obkv_buffer: Vec<u8>,
+    value_buffer: Vec<u8>,
+    values: BTreeMap<FieldId, Value>,
+    count: usize,
 }
 
 impl<W: io::Write> DocumentBatchBuilder<W> {
     pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::new();
+        let index = DocumentsBatchIndex::default();
         let mut writer = ByteCounter::new(writer);
         // add space to write the offset of the metadata at the end of the writer
        writer.write_u64::<BigEndian>(0)?;
 
-        let serializer =
-            DocumentSerializer { writer, buffer: Vec::new(), index, count: 0, allow_seq: true };
-
-        Ok(Self { serializer })
+        Ok(Self {
+            inner: writer,
+            index,
+            obkv_buffer: Vec::new(),
+            value_buffer: Vec::new(),
+            values: BTreeMap::new(),
+            count: 0,
+        })
     }
 
     /// Returns the number of documents that have been written to the builder.
     pub fn len(&self) -> usize {
-        self.serializer.count
+        self.count
     }
 
     /// This method must be called after the document addition is terminated. It will put the
     /// metadata at the end of the file, and write the metadata offset at the beginning on the
     /// file.
-    pub fn finish(self) -> Result<(), Error> {
-        let DocumentSerializer {
-            writer: ByteCounter { mut writer, count: offset },
-            index,
-            count,
-            ..
-        } = self.serializer;
+    pub fn finish(self) -> Result<usize, Error> {
+        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
 
         let meta = DocumentsMetadata { count, index };
 
@@ -65,16 +74,481 @@ impl<W: io::Write> DocumentBatchBuilder<W> {
         writer.flush()?;
 
-        Ok(())
+        Ok(count)
     }
 
-    /// Adds documents to the builder.
+    /// Extends the builder with json documents from a reader.
+    pub fn extend_from_json<R: io::Read>(&mut self, reader: R) -> Result<(), Error> {
+        let mut de = serde_json::Deserializer::from_reader(reader);
+
+        let mut visitor = DocumentVisitor {
+            inner: &mut self.inner,
+            index: &mut self.index,
+            obkv_buffer: &mut self.obkv_buffer,
+            value_buffer: &mut self.value_buffer,
+            values: &mut self.values,
+            count: &mut self.count,
+        };
+
+        de.deserialize_any(&mut visitor).map_err(Error::JsonError)?
+    }
+
+    /// Creates a builder from a reader of CSV documents.
     ///
-    /// The internal index is updated with the fields found
-    /// in the documents. Document must either be a map or a sequences of map, anything else will
-    /// fail.
-    pub fn add_documents<T: Serialize>(&mut self, document: T) -> Result<(), Error> {
-        document.serialize(&mut self.serializer)?;
-        Ok(())
+    /// Since all fields in a csv document are guaranteed to be ordered, we are able to perform
+    /// optimisations, and extending from another CSV is not allowed.
+    pub fn from_csv<R: io::Read>(reader: R, writer: W) -> Result<Self, Error> {
+        let mut this = Self::new(writer)?;
+        // Ensure that this is the first and only addition made with this builder
+        debug_assert!(this.index.is_empty());
+
+        let mut records = csv::Reader::from_reader(reader);
+
+        let headers = records
+            .headers()?
+            .into_iter()
+            .map(parse_csv_header)
+            .map(|(k, t)| (this.index.insert(&k), t))
+            .collect::<Vec<_>>();
+
+        for (i, record) in records.into_records().enumerate() {
+            let record = record?;
+            this.obkv_buffer.clear();
+            let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer);
+            for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) {
+                let value = match ty {
+                    AllowedType::Number => {
+                        value.parse::<f64>().map(Value::from).map_err(|error| {
+                            Error::ParseFloat {
+                                error,
+                                // +1 for the header offset.
+                                line: i + 1,
+                                value: value.to_string(),
+                            }
+                        })?
+                    }
+                    AllowedType::String => Value::String(value.to_string()),
+                };
+
+                this.value_buffer.clear();
+                serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?;
+                writer.insert(*fid, &this.value_buffer)?;
+            }
+
+            this.inner.write_u32::<BigEndian>(this.obkv_buffer.len() as u32)?;
+            this.inner.write_all(&this.obkv_buffer)?;
+
+            this.count += 1;
+        }
+
+        Ok(this)
+    }
+}
+
+#[derive(Debug)]
+enum AllowedType {
+    String,
+    Number,
+}
+
+fn parse_csv_header(header: &str) -> (String, AllowedType) {
+    // if there are several separators we only split on the last one.
+    match header.rsplit_once(':') {
+        Some((field_name, field_type)) => match field_type {
+            "string" => (field_name.to_string(), AllowedType::String),
+            "number" => (field_name.to_string(), AllowedType::Number),
+            // if the pattern isn't recognized, we keep the whole field.
+            _otherwise => (header.to_string(), AllowedType::String),
+        },
+        None => (header.to_string(), AllowedType::String),
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use std::io::Cursor;
+
+    use serde_json::{json, Map};
+
+    use super::*;
+    use crate::documents::DocumentBatchReader;
+
+    fn obkv_to_value(obkv: &obkv::KvReader<FieldId>, index: &DocumentsBatchIndex) -> Value {
+        let mut map = Map::new();
+
+        for (fid, value) in obkv.iter() {
+            let field_name = index.name(fid).unwrap().clone();
+            let value: Value = serde_json::from_slice(value).unwrap();
+
+            map.insert(field_name, value);
+        }
+
+        Value::Object(map)
+    }
+
+    #[test]
+    fn add_single_documents_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!({
+            "id": 1,
+            "field": "hello!",
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        let json = serde_json::json!({
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        });
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_seq_json() {
+        let mut cursor = Cursor::new(Vec::new());
+        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
+
+        let json = serde_json::json!([{
+            "id": 1,
+            "field": "hello!",
+        },{
+            "blabla": false,
+            "field": "hello!",
+            "id": 1,
+        }
+        ]);
+
+        builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap();
+
+        assert_eq!(builder.len(), 2);
+
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 2);
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 3);
+        assert_eq!(document.iter().count(), 3);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn add_documents_csv() {
+        let mut cursor = Cursor::new(Vec::new());
+
+        let csv = "id:number,field:string\n1,hello!\n2,blabla";
+
+        let builder =
+            DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap();
+        builder.finish().unwrap();
+
+        cursor.set_position(0);
+
+        let mut reader = DocumentBatchReader::from_reader(cursor).unwrap();
+
+        let (index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(index.len(), 2);
+        assert_eq!(document.iter().count(), 2);
+
+        let (_index, document) = reader.next_document_with_index().unwrap().unwrap();
+        assert_eq!(document.iter().count(), 2);
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn simple_csv_document() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+
+        assert!(reader.next_document_with_index().unwrap().is_none());
+    }
+
+    #[test]
+    fn coma_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United, States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United, States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn quote_in_field() {
+        let documents = r#"city,country,pop
+"Boston","United"" States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United\" States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn integer_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.0,
+            })
+        );
+    }
+
+    #[test]
+    fn float_in_field() {
+        let documents = r#"city,country,pop:number
+"Boston","United States","4628910.01""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city": "Boston",
+                "country": "United States",
+                "pop": 4628910.01,
+            })
+        );
+    }
+
+    #[test]
+    fn several_colon_in_header() {
+        let documents = r#"city:love:string,country:state,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:love": "Boston",
+                "country:state": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn ending_by_colon_in_header() {
+        let documents = r#"city:,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn starting_by_colon_in_header() {
+        let documents = r#":city,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                ":city": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[ignore]
+    #[test]
+    fn starting_by_colon_in_header2() {
+        let documents = r#":string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+
+        assert!(reader.next_document_with_index().is_err());
+    }
+
+    #[test]
+    fn double_colon_in_header() {
+        let documents = r#"city::string,country,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf))
+            .unwrap()
+            .finish()
+            .unwrap();
+        let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap();
+        let (index, doc) = reader.next_document_with_index().unwrap().unwrap();
+        let val = obkv_to_value(&doc, index);
+
+        assert_eq!(
+            val,
+            json!({
+                "city:": "Boston",
+                "country": "United States",
+                "pop": "4628910",
+            })
+        );
+    }
+
+    #[test]
+    fn bad_type_in_header() {
+        let documents = r#"city,country:number,pop
+"Boston","United States","4628910""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count1() {
+        let documents = r#"city,country,pop
+"Boston","United States","4628910", "too much""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
+    }
+
+    #[test]
+    fn bad_column_count2() {
+        let documents = r#"city,country,pop
+"Boston","United States""#;
+
+        let mut buf = Vec::new();
+        assert!(
+            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
+        );
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index f79c210fe..14d97ee7d 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -7,7 +7,8 @@ mod builder;
 mod reader;
 mod serde;
 
-use std::{fmt, io};
+use std::fmt::{self, Debug};
+use std::io;
 
 use ::serde::{Deserialize, Serialize};
 use bimap::BiHashMap;
@@ -17,7 +18,38 @@ pub use reader::DocumentBatchReader;
 use crate::FieldId;
 
 /// A bidirectional map that links field ids to their name in a document batch.
-pub type DocumentsBatchIndex = BiHashMap<FieldId, String>;
+#[derive(Default, Debug, Serialize, Deserialize)]
+pub struct DocumentsBatchIndex(pub BiHashMap<FieldId, String>);
+
+impl DocumentsBatchIndex {
+    /// Insert the field in the map, or return its field id if it already exists.
+    pub fn insert(&mut self, field: &str) -> FieldId {
+        match self.0.get_by_right(field) {
+            Some(field_id) => *field_id,
+            None => {
+                let field_id = self.0.len() as FieldId;
+                self.0.insert(field_id, field.to_string());
+                field_id
+            }
+        }
+    }
+
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
+    }
+
+    pub fn len(&self) -> usize {
+        self.0.len()
+    }
+
+    pub fn iter(&self) -> bimap::hash::Iter<FieldId, String> {
+        self.0.iter()
+    }
+
+    pub fn name(&self, id: FieldId) -> Option<&String> {
+        self.0.get_by_left(&id)
+    }
+}
 
 #[derive(Debug, Serialize, Deserialize)]
 struct DocumentsMetadata {
@@ -50,14 +82,22 @@ impl<W: io::Write> io::Write for ByteCounter<W> {
 
 #[derive(Debug)]
 pub enum Error {
+    ParseFloat { error: std::num::ParseFloatError, line: usize, value: String },
     InvalidDocumentFormat,
     Custom(String),
     JsonError(serde_json::Error),
+    CsvError(csv::Error),
     Serialize(bincode::Error),
     Io(io::Error),
     DocumentTooLarge,
 }
 
+impl From<csv::Error> for Error {
+    fn from(e: csv::Error) -> Self {
+        Self::CsvError(e)
+    }
+}
+
 impl From<io::Error> for Error {
     fn from(other: io::Error) -> Self {
         Self::Io(other)
     }
 }
 
@@ -70,15 +110,25 @@ impl From<bincode::Error> for Error {
     }
 }
 
+impl From<serde_json::Error> for Error {
+    fn from(other: serde_json::Error) -> Self {
+        Self::JsonError(other)
+    }
+}
+
 impl fmt::Display for Error {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
         match self {
+            Error::ParseFloat { error, line, value } => {
+                write!(f, "Error parsing number {:?} at line {}: {}", value, line, error)
+            }
             Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s),
             Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."),
             Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err),
-            Error::Io(e) => e.fmt(f),
+            Error::Io(e) => write!(f, "{}", e),
             Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"),
-            Error::Serialize(e) => e.fmt(f),
+            Error::Serialize(e) => write!(f, "{}", e),
+            Error::CsvError(e) => write!(f, "{}", e),
         }
     }
 }
@@ -92,7 +142,8 @@ macro_rules! documents {
         let documents = serde_json::json!($data);
         let mut writer = std::io::Cursor::new(Vec::new());
         let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-        builder.add_documents(documents).unwrap();
+        let documents = serde_json::to_vec(&documents).unwrap();
+        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
         builder.finish().unwrap();
 
         writer.set_position(0);
@@ -103,6 +154,8 @@ macro_rules! documents {
 
 #[cfg(test)]
 mod test {
+    use std::io::Cursor;
+
     use serde_json::{json, Value};
 
     use super::*;
@@ -119,12 +172,14 @@ mod test {
             "bool": true
         });
 
+        let json = serde_json::to_vec(&json).unwrap();
+
         let mut v = Vec::new();
         let mut cursor = io::Cursor::new(&mut v);
 
         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
 
-        builder.add_documents(json).unwrap();
+        builder.extend_from_json(Cursor::new(json)).unwrap();
 
         builder.finish().unwrap();
 
@@ -148,13 +203,16 @@ mod test {
             "toto": false,
         });
 
+        let doc1 = serde_json::to_vec(&doc1).unwrap();
+        let doc2 = serde_json::to_vec(&doc2).unwrap();
+
         let mut v = Vec::new();
         let mut cursor = io::Cursor::new(&mut v);
 
         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
 
-        builder.add_documents(doc1).unwrap();
-        builder.add_documents(doc2).unwrap();
+        builder.extend_from_json(Cursor::new(doc1)).unwrap();
+        builder.extend_from_json(Cursor::new(doc2)).unwrap();
 
         builder.finish().unwrap();
 
@@ -177,12 +235,14 @@ mod test {
             { "tata": "hello" },
         ]);
 
+        let docs = serde_json::to_vec(&docs).unwrap();
+
         let mut v = Vec::new();
         let mut cursor = io::Cursor::new(&mut v);
 
         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
 
-        builder.add_documents(docs).unwrap();
+        builder.extend_from_json(Cursor::new(docs)).unwrap();
 
         builder.finish().unwrap();
 
@@ -210,11 +270,13 @@ mod test {
             { "tata": "hello" },
         ]]);
 
-        assert!(builder.add_documents(docs).is_err());
+        let docs = serde_json::to_vec(&docs).unwrap();
+        assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
 
         let docs = json!("hello");
+        let docs = serde_json::to_vec(&docs).unwrap();
 
-        assert!(builder.add_documents(docs).is_err());
+        assert!(builder.extend_from_json(Cursor::new(docs)).is_err());
     }
 
     #[test]
diff --git a/milli/src/documents/serde.rs b/milli/src/documents/serde.rs
index 036ec246a..d57bf1ffb 100644
--- a/milli/src/documents/serde.rs
+++ b/milli/src/documents/serde.rs
@@ -1,474 +1,134 @@
 use std::collections::BTreeMap;
-use std::convert::TryInto;
-use std::io::Cursor;
-use std::{fmt, io};
+use std::fmt;
+use std::io::{Cursor, Write};
 
-use byteorder::{BigEndian, WriteBytesExt};
-use obkv::KvWriter;
-use serde::ser::{Impossible, Serialize, SerializeMap, SerializeSeq, Serializer};
+use byteorder::WriteBytesExt;
+use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
+use serde::Deserialize;
 use serde_json::Value;
 
 use super::{ByteCounter, DocumentsBatchIndex, Error};
 use crate::FieldId;
 
-pub struct DocumentSerializer<W> {
-    pub writer: ByteCounter<W>,
-    pub buffer: Vec<u8>,
-    pub index: DocumentsBatchIndex,
-    pub count: usize,
-    pub allow_seq: bool,
-}
-
-impl<'a, W: io::Write> Serializer for &'a mut DocumentSerializer<W> {
-    type Ok = ();
-
-    type Error = Error;
-
-    type SerializeSeq = SeqSerializer<'a, W>;
-    type SerializeTuple = Impossible<(), Self::Error>;
-    type SerializeTupleStruct = Impossible<(), Self::Error>;
-    type SerializeTupleVariant = Impossible<(), Self::Error>;
-    type SerializeMap = MapSerializer<'a, &'a mut ByteCounter<W>>;
-    type SerializeStruct = Impossible<(), Self::Error>;
-    type SerializeStructVariant = Impossible<(), Self::Error>;
-
-    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        self.buffer.clear();
-        let cursor = io::Cursor::new(&mut self.buffer);
-        self.count += 1;
-        let map_serializer = MapSerializer {
-            map: KvWriter::new(cursor),
-            index: &mut self.index,
-            writer: &mut self.writer,
-            mapped_documents: BTreeMap::new(),
-        };
-
-        Ok(map_serializer)
-    }
-
-    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        if self.allow_seq {
-            // Only allow sequence of documents of depth 1.
-            self.allow_seq = false;
-            Ok(SeqSerializer { serializer: self })
-        } else {
-            Err(Error::InvalidDocumentFormat)
+macro_rules! tri {
+    ($e:expr) => {
+        match $e {
+            Ok(r) => r,
+            Err(e) => return Ok(Err(e.into())),
         }
-    }
+    };
+}
 
-    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
+struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex);
 
-    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
+impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;
 
-    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
     where
-        T: Serialize,
+        D: serde::Deserializer<'de>,
     {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-    ) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_struct<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_variant<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
-    where
-        T: Serialize,
-    {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_tuple_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeTupleVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct(
-        self,
-        _name: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStruct, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_struct_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-        _len: usize,
-    ) -> Result<Self::SerializeStructVariant, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
+        deserializer.deserialize_str(self)
     }
 }
 
-pub struct SeqSerializer<'a, W> {
-    serializer: &'a mut DocumentSerializer<W>,
-}
+impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> {
+    type Value = FieldId;
 
-impl<'a, W: io::Write> SerializeSeq for SeqSerializer<'a, W> {
-    type Ok = ();
-    type Error = Error;
-
-    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
     where
-        T: Serialize,
+        E: serde::de::Error,
     {
-        value.serialize(&mut *self.serializer)?;
-        Ok(())
+        Ok(self.0.insert(v))
     }
 
-    fn end(self) -> Result<Self::Ok, Self::Error> {
-        Ok(())
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a string")
     }
 }
 
-pub struct MapSerializer<'a, W> {
-    map: KvWriter<io::Cursor<&'a mut Vec<u8>>, FieldId>,
-    index: &'a mut DocumentsBatchIndex,
-    writer: W,
-    mapped_documents: BTreeMap<FieldId, Value>,
+struct ValueDeserializer;
+
+impl<'de> DeserializeSeed<'de> for ValueDeserializer {
+    type Value = serde_json::Value;
+
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        serde_json::Value::deserialize(deserializer)
+    }
 }
 
-/// This implementation of SerializeMap uses serilialize_entry instead of seriliaze_key and
-/// serialize_value, therefore these to methods remain unimplemented.
-impl<'a, W: io::Write> SerializeMap for MapSerializer<'a, W> {
-    type Ok = ();
-    type Error = Error;
+pub struct DocumentVisitor<'a, W> {
+    pub inner: &'a mut ByteCounter<W>,
+    pub index: &'a mut DocumentsBatchIndex,
+    pub obkv_buffer: &'a mut Vec<u8>,
+    pub value_buffer: &'a mut Vec<u8>,
+    pub values: &'a mut BTreeMap<FieldId, Value>,
+    pub count: &'a mut usize,
+}
 
-    fn serialize_key<T: Serialize + ?Sized>(&mut self, _key: &T) -> Result<(), Self::Error> {
-        unreachable!()
-    }
+impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> {
+    /// This Visitor value is nothing, since it writes the value to a file.
+    type Value = Result<(), Error>;
 
-    fn serialize_value<T: Serialize + ?Sized>(&mut self, _value: &T) -> Result<(), Self::Error> {
-        unreachable!()
-    }
-
-    fn end(mut self) -> Result<Self::Ok, Self::Error> {
-        let mut buf = Vec::new();
-        for (key, value) in self.mapped_documents {
-            buf.clear();
-            let mut cursor = Cursor::new(&mut buf);
-            serde_json::to_writer(&mut cursor, &value).map_err(Error::JsonError)?;
-            self.map.insert(key, cursor.into_inner()).map_err(Error::Io)?;
+    fn visit_seq<A>(self, mut seq: A) -> Result<Self::Value, A::Error>
+    where
+        A: SeqAccess<'de>,
+    {
+        while let Some(v) = seq.next_element_seed(&mut *self)? {
+            tri!(v)
         }
-        let data = self.map.into_inner().map_err(Error::Io)?.into_inner();
-        let data_len: u32 = data.len().try_into().map_err(|_| Error::DocumentTooLarge)?;
-
-        self.writer.write_u32::<BigEndian>(data_len).map_err(Error::Io)?;
-        self.writer.write_all(&data).map_err(Error::Io)?;
-
-        Ok(())
+        Ok(Ok(()))
     }
 
-    fn serialize_entry<K: ?Sized, V: ?Sized>(
-        &mut self,
-        key: &K,
-        value: &V,
-    ) -> Result<(), Self::Error>
+    fn visit_map<A>(self, mut map: A) -> Result<Self::Value, A::Error>
     where
-        K: Serialize,
-        V: Serialize,
+        A: MapAccess<'de>,
     {
-        let field_serializer = FieldSerializer { index: &mut self.index };
-        let field_id: FieldId = key.serialize(field_serializer)?;
+        while let Some((key, value)) =
+            map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)?
+        {
+            self.values.insert(key, value);
+        }
 
-        let value = serde_json::to_value(value).map_err(Error::JsonError)?;
+        self.obkv_buffer.clear();
+        let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer));
+        for (key, value) in self.values.iter() {
+            self.value_buffer.clear();
+            // This is guaranteed to work
+            tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value));
+            tri!(obkv.insert(*key, &self.value_buffer));
+        }
 
-        self.mapped_documents.insert(field_id, value);
+        let reader = tri!(obkv.into_inner()).into_inner();
 
-        Ok(())
+        tri!(self.inner.write_u32::<byteorder::BigEndian>(reader.len() as u32));
+        tri!(self.inner.write_all(reader));
+
+        *self.count += 1;
+        self.values.clear();
+
+        Ok(Ok(()))
+    }
+
+    fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        write!(f, "a documents, or a sequence of documents.")
     }
 }
 
-struct FieldSerializer<'a> {
-    index: &'a mut DocumentsBatchIndex,
-}
+impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W>
+where
+    W: Write,
+{
+    type Value = Result<(), Error>;
 
-impl<'a> serde::Serializer for FieldSerializer<'a> {
-    type Ok = FieldId;
-
-    type Error = Error;
-
-    type SerializeSeq = Impossible<FieldId, Error>;
-    type SerializeTuple = Impossible<FieldId, Error>;
-    type SerializeTupleStruct = Impossible<FieldId, Error>;
-    type SerializeTupleVariant = Impossible<FieldId, Error>;
-    type SerializeMap = Impossible<FieldId, Error>;
-    type SerializeStruct = Impossible<FieldId, Error>;
-    type SerializeStructVariant = Impossible<FieldId, Error>;
-
-    fn serialize_str(self, ws: &str) -> Result<Self::Ok, Self::Error> {
-        let field_id = match self.index.get_by_right(ws) {
-            Some(field_id) => *field_id,
-            None => {
-                let field_id = self.index.len() as FieldId;
-                self.index.insert(field_id, ws.to_string());
-                field_id
-            }
-        };
-
-        Ok(field_id)
-    }
-
-    fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i8(self, _v: i8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i16(self, _v: i16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i32(self, _v: i32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_i64(self, _v: i64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u8(self, _v: u8) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u16(self, _v: u16) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u32(self, _v: u32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_u64(self, _v: u64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f32(self, _v: f32) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_f64(self, _v: f64) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_char(self, _v: char) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+    fn deserialize<D>(self, deserializer: D) -> Result<Self::Value, D::Error>
     where
-        T: Serialize,
+        D: serde::Deserializer<'de>,
     {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_unit_variant(
-        self,
-        _name: &'static str,
-        _variant_index: u32,
-        _variant: &'static str,
-    ) -> Result<Self::Ok, Self::Error> {
-        Err(Error::InvalidDocumentFormat)
-    }
-
-    fn serialize_newtype_struct<T: ?Sized>(
-        self,
-        _name: &'static str,
-        _value: &T,
-    ) -> Result<Self::Ok, Self::Error>
- where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(Error::InvalidDocumentFormat) - } -} - -impl serde::ser::Error for Error { - fn custom(msg: T) -> Self { - Error::Custom(msg.to_string()) + deserializer.deserialize_map(self) } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 3d744da5c..47c9a5993 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -1,4 +1,4 @@ -use std::collections::HashSet; +use std::collections::BTreeSet; use std::convert::Infallible; use std::error::Error as StdError; use std::{fmt, io, str}; @@ -57,19 +57,17 @@ pub enum UserError { CriterionError(CriterionError), DocumentLimitReached, InvalidDocumentId { document_id: Value }, - InvalidFacetsDistribution { invalid_facets_name: HashSet }, + InvalidFacetsDistribution { invalid_facets_name: BTreeSet }, InvalidGeoField { document_id: Value, object: Value }, InvalidFilter(String), - InvalidSortName { name: String }, - InvalidSortableAttribute { field: String, valid_fields: HashSet }, + InvalidSortableAttribute { field: String, valid_fields: BTreeSet }, SortRankingRuleMissing, InvalidStoreFile, MaxDatabaseSizeReached, - MissingDocumentId { document: Object }, + MissingDocumentId { primary_key: String, document: Object }, MissingPrimaryKey, NoSpaceLeftOnDevice, - PrimaryKeyCannotBeChanged, - PrimaryKeyCannotBeReset, + PrimaryKeyCannotBeChanged(String), SerdeJson(serde_json::Error), SortError(SortError), UnknownInternalDocumentId { document_id: DocumentId }, @@ -168,7 +166,7 @@ impl From for Error { impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InternalError(error) => write!(f, "internal: {}", error), + Self::InternalError(error) => write!(f, "internal: {}.", error), Self::IoError(error) => error.fmt(f), Self::UserError(error) => error.fmt(f), } @@ -181,15 +179,15 @@ impl fmt::Display for InternalError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { Self::DatabaseMissingEntry { db_name, key } => { - write!(f, "missing {} in the {} database", key.unwrap_or("key"), db_name) + write!(f, "Missing {} in the {} database.", key.unwrap_or("key"), db_name) } Self::FieldIdMapMissingEntry(error) => error.fmt(f), Self::Fst(error) => error.fmt(f), Self::GrenadInvalidCompressionType => { - f.write_str("invalid compression type have been specified to grenad") + f.write_str("Invalid compression type have been specified to grenad.") } Self::IndexingMergingKeys { 
process } => { - write!(f, "invalid merge while processing {}", process) + write!(f, "Invalid merge while processing {}.", process) } Self::Serialization(error) => error.fmt(f), Self::InvalidDatabaseTyping => HeedError::InvalidDatabaseTyping.fmt(f), @@ -207,69 +205,75 @@ impl StdError for InternalError {} impl fmt::Display for UserError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - Self::InvalidFilter(input) => write!(f, "{}", input), - Self::AttributeLimitReached => f.write_str("maximum number of attributes reached"), + Self::InvalidFilter(error) => f.write_str(error), + Self::AttributeLimitReached => f.write_str("A document cannot contain more than 65,535 fields."), Self::CriterionError(error) => write!(f, "{}", error), - Self::DocumentLimitReached => f.write_str("maximum number of documents reached"), + Self::DocumentLimitReached => f.write_str("Maximum number of documents reached."), Self::InvalidFacetsDistribution { invalid_facets_name } => { let name_list = invalid_facets_name.iter().map(AsRef::as_ref).collect::>().join(", "); write!( f, - "invalid facet distribution, the fields {} are not set as filterable", + "Invalid facet distribution, the fields `{}` are not set as filterable.", name_list ) } - Self::InvalidGeoField { document_id, object } => write!( - f, - "the document with the id: {} contains an invalid _geo field: {}", - document_id, object - ), - Self::InvalidDocumentId { document_id } => { - let json = serde_json::to_string(document_id).unwrap(); + Self::InvalidGeoField { document_id, object } => { + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; + let object = match object { + Value::String(id) => id.clone(), + _ => object.to_string(), + }; write!( f, - "document identifier is invalid {}, \ -a document id can be of type integer or string \ -only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_)", - json + "The document with the id: `{}` contains an invalid _geo field: `{}`.", + document_id, object + ) + }, + Self::InvalidDocumentId { document_id } => { + let document_id = match document_id { + Value::String(id) => id.clone(), + _ => document_id.to_string(), + }; + write!( + f, + "Document identifier `{}` is invalid. \ +A document identifier can be of type integer or string, \ +only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_).", + document_id ) - } - Self::InvalidSortName { name } => { - write!(f, "Invalid syntax for the sort parameter: {}", name) } Self::InvalidSortableAttribute { field, valid_fields } => { let valid_names = valid_fields.iter().map(AsRef::as_ref).collect::>().join(", "); write!( f, - "Attribute {} is not sortable, available sortable attributes are: {}", + "Attribute `{}` is not sortable. 
                     field, valid_names
                 )
             }
             Self::SortRankingRuleMissing => f.write_str(
-                "You must specify where \"sort\" is listed in the \
-rankingRules setting to use the sort parameter at search time",
+                "The sort ranking rule must be specified in the \
+ranking rules settings to use the sort parameter at search time.",
             ),
-            Self::MissingDocumentId { document } => {
+            Self::MissingDocumentId { primary_key, document } => {
                 let json = serde_json::to_string(document).unwrap();
-                write!(f, "document doesn't have an identifier {}", json)
+                write!(f, "Document doesn't have a `{}` attribute: `{}`.", primary_key, json)
             }
-            Self::MissingPrimaryKey => f.write_str("missing primary key"),
-            Self::MaxDatabaseSizeReached => f.write_str("maximum database size reached"),
-            // TODO where can we find it instead of writing the text ourselves?
-            Self::NoSpaceLeftOnDevice => f.write_str("no space left on device"),
-            Self::InvalidStoreFile => f.write_str("store file is not a valid database file"),
-            Self::PrimaryKeyCannotBeChanged => {
-                f.write_str("primary key cannot be changed if the database contains documents")
-            }
-            Self::PrimaryKeyCannotBeReset => {
-                f.write_str("primary key cannot be reset if the database contains documents")
+            Self::MissingPrimaryKey => f.write_str("The primary key inference process failed because the engine did not find any fields containing the `id` substring in their name. If your document identifier does not contain the `id` substring, you can set the primary key of the index."),
+            Self::MaxDatabaseSizeReached => f.write_str("Maximum database size has been reached."),
+            Self::NoSpaceLeftOnDevice => f.write_str("There is no more space left on the device. Consider increasing the size of the disk/partition."),
+            Self::InvalidStoreFile => f.write_str("The database file is in an invalid state."),
+            Self::PrimaryKeyCannotBeChanged(primary_key) => {
+                write!(f, "Index already has a primary key: `{}`.", primary_key)
             }
             Self::SerdeJson(error) => error.fmt(f),
             Self::SortError(error) => write!(f, "{}", error),
             Self::UnknownInternalDocumentId { document_id } => {
-                write!(f, "an unknown internal document id have been used ({})", document_id)
+                write!(f, "An unknown internal document id has been used: `{}`.", document_id)
             }
         }
     }
diff --git a/milli/src/index.rs b/milli/src/index.rs
index 6ce693fbe..fe89fe734 100644
--- a/milli/src/index.rs
+++ b/milli/src/index.rs
@@ -953,6 +953,7 @@ pub(crate) mod tests {
             { "id": 1, "name": "kevin", "has_dog": true },
             { "id": 2, "name": "bob" }
         ]);
+
         let mut wtxn = index.write_txn().unwrap();
         let builder = IndexDocuments::new(&mut wtxn, &index, 0);
         builder.execute(content, |_, _| ()).unwrap();
diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs
index deb51a053..11f6379e3 100644
--- a/milli/src/search/distinct/mod.rs
+++ b/milli/src/search/distinct/mod.rs
@@ -68,7 +68,9 @@ mod test {
                 "txts": sample_txts[..(rng.gen_range(0..3))],
                 "cat-ints": sample_ints[..(rng.gen_range(0..3))],
             });
-            builder.add_documents(doc).unwrap();
+
+            let doc = Cursor::new(serde_json::to_vec(&doc).unwrap());
+            builder.extend_from_json(doc).unwrap();
         }

         builder.finish().unwrap();
diff --git a/milli/src/search/facet/grammar.pest b/milli/src/search/facet/grammar.pest
new file mode 100644
index 000000000..8bfdeb667
--- /dev/null
+++ b/milli/src/search/facet/grammar.pest
@@ -0,0 +1,33 @@
+key = _{reserved | quoted | word }
+value = _{quoted | word }
+quoted = _{ (PUSH("'") | PUSH("\"")) ~ string ~ POP }
+string = {char*}
+word = ${(LETTER | NUMBER | "_" | "-" | ".")+}
+
+char = _{ !(PEEK | "\\") ~ ANY
+    | "\\" ~ (PEEK | "\\" | "/" | "b" | "f" | "n" | "r" | "t")
+    | "\\" ~ ("u" ~ ASCII_HEX_DIGIT{4})}
+
+reserved = { "_geoDistance" | ("_geoPoint" ~ parameters) | "_geo" }
+// we deliberately choose to allow empty parameters to generate more specific error messages later
+parameters = {("(" ~ (value ~ ",")* ~ value? ~ ")") | ""}
+condition = _{between | eq | greater | less | geq | leq | neq}
+between = {key ~ value ~ "TO" ~ value}
+geq = {key ~ ">=" ~ value}
+leq = {key ~ "<=" ~ value}
+neq = {key ~ "!=" ~ value}
+eq = {key ~ "=" ~ value}
+greater = {key ~ ">" ~ value}
+less = {key ~ "<" ~ value}
+geo_radius = {"_geoRadius" ~ parameters }
+
+prgm = {SOI ~ expr ~ EOI}
+expr = _{ ( term ~ (operation ~ term)* ) }
+term = { ("(" ~ expr ~ ")") | condition | not | geo_radius }
+operation = _{ and | or }
+and = {"AND"}
+or = {"OR"}
+
+not = {"NOT" ~ term}
+
+WHITESPACE = _{ " " }
diff --git a/milli/src/search/mod.rs b/milli/src/search/mod.rs
index a31ead1ec..7c8722187 100644
--- a/milli/src/search/mod.rs
+++ b/milli/src/search/mod.rs
@@ -151,13 +151,13 @@ impl<'a> Search<'a> {
                 Member::Field(ref field) if !sortable_fields.contains(field) => {
                     return Err(UserError::InvalidSortableAttribute {
                         field: field.to_string(),
-                        valid_fields: sortable_fields,
+                        valid_fields: sortable_fields.into_iter().collect(),
                     })?
                 }
                 Member::Geo(_) if !sortable_fields.contains("_geo") => {
                     return Err(UserError::InvalidSortableAttribute {
                         field: "_geo".to_string(),
-                        valid_fields: sortable_fields,
+                        valid_fields: sortable_fields.into_iter().collect(),
                     })?
                 }
                 _ => (),
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 92bcab0e9..440546b10 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -877,7 +877,8 @@ mod tests {
         let mut cursor = Cursor::new(Vec::new());
         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        builder.add_documents(big_object).unwrap();
+        let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap());
+        builder.extend_from_json(big_object).unwrap();
         builder.finish().unwrap();

         cursor.set_position(0);
         let content = DocumentBatchReader::from_reader(cursor).unwrap();
@@ -905,8 +906,9 @@ mod tests {

         let mut cursor = Cursor::new(Vec::new());
+        let big_object = serde_json::to_string(&big_object).unwrap();
         let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-        builder.add_documents(big_object).unwrap();
+        builder.extend_from_json(&mut big_object.as_bytes()).unwrap();
         builder.finish().unwrap();

         cursor.set_position(0);
         let content = DocumentBatchReader::from_reader(cursor).unwrap();
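
The two test hunks above feed the builder through `extend_from_json`, which accepts any `impl Read`. Both idioms from those tests, condensed into one sketch (the `two_ways` wrapper and the sample document are invented for illustration):

    use std::io::Cursor;

    use milli::documents::DocumentBatchBuilder;
    use serde_json::json;

    fn two_ways() -> anyhow::Result<Vec<u8>> {
        let mut writer = Cursor::new(Vec::new());
        let mut builder = DocumentBatchBuilder::new(&mut writer)?;
        let doc = json!({ "id": 1, "name": "kevin" });

        // An owned buffer behind a Cursor...
        builder.extend_from_json(Cursor::new(serde_json::to_vec(&doc)?))?;

        // ...or a mutable reference to a byte slice: both implement Read.
        let text = serde_json::to_string(&doc)?;
        builder.extend_from_json(&mut text.as_bytes())?;

        builder.finish()?;
        Ok(writer.into_inner())
    }
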
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index c0c88abed..855fb8db9 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -75,7 +75,7 @@ fn create_fields_mapping(
         .collect()
 }

-fn find_primary_key(index: &bimap::BiHashMap<u16, String>) -> Option<&str> {
+fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
     index
         .iter()
         .sorted_by_key(|(k, _)| *k)
@@ -179,7 +179,7 @@ impl Transform<'_, '_> {
             if !self.autogenerate_docids {
                 let mut json = Map::new();
                 for (key, value) in document.iter() {
-                    let key = addition_index.get_by_left(&key).cloned();
+                    let key = addition_index.name(key).cloned();
                     let value = serde_json::from_slice::<Value>(&value).ok();

                     if let Some((k, v)) = key.zip(value) {
@@ -186,7 +186,11 @@ impl Transform<'_, '_> {
                         json.insert(k, v);
                     }
                 }
-                return Err(UserError::MissingDocumentId { document: json }.into());
+                return Err(UserError::MissingDocumentId {
+                    primary_key: primary_key_name,
+                    document: json,
+                }
+                .into());
             }

             let uuid =
@@ -544,6 +548,7 @@ mod test {
     mod primary_key_inference {
         use bimap::BiHashMap;

+        use crate::documents::DocumentsBatchIndex;
         use crate::update::index_documents::transform::find_primary_key;

         #[test]
@@ -557,7 +562,7 @@ mod test {
            map.insert(4, "fakeId".to_string());
            map.insert(0, "realId".to_string());

-            assert_eq!(find_primary_key(&map), Some("realId"));
+            assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId"));
        }
    }
 }
diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs
index f25bceb7b..0326b5144 100644
--- a/milli/src/update/settings.rs
+++ b/milli/src/update/settings.rs
@@ -465,7 +465,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                     self.index.put_primary_key(self.wtxn, primary_key)?;
                     Ok(())
                 } else {
-                    Err(UserError::PrimaryKeyCannotBeChanged.into())
+                    let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                    Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                 }
             }
             Setting::Reset => {
@@ -473,7 +474,8 @@ impl<'a, 't, 'u, 'i> Settings<'a, 't, 'u, 'i> {
                     self.index.delete_primary_key(self.wtxn)?;
                     Ok(())
                 } else {
-                    Err(UserError::PrimaryKeyCannotBeReset.into())
+                    let primary_key = self.index.primary_key(self.wtxn)?.unwrap();
+                    Err(UserError::PrimaryKeyCannotBeChanged(primary_key.to_string()).into())
                 }
             }
             Setting::NotSet => Ok(()),
@@ -1106,7 +1108,7 @@ mod tests {
         builder.reset_primary_key();

         let err = builder.execute(|_, _| ()).unwrap_err();
-        assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeReset)));
+        assert!(matches!(err, Error::UserError(UserError::PrimaryKeyCannotBeChanged(_))));
         wtxn.abort().unwrap();

         // But if we clear the database...
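
With `PrimaryKeyCannotBeReset` folded into `PrimaryKeyCannotBeChanged(String)` above, the change and reset paths surface the same error, now carrying the key that is already set. A sketch of a caller matching on that payload (it assumes `Error` and `UserError` are re-exported at the crate root; `describe` is an invented helper):

    use milli::{Error, UserError};

    fn describe(err: &Error) -> Option<String> {
        match err {
            Error::UserError(UserError::PrimaryKeyCannotBeChanged(key)) => {
                Some(format!("the index is already keyed by `{}`", key))
            }
            _ => None,
        }
    }
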
diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs
index cda0da617..e8fb3fdfa 100644
--- a/milli/tests/search/mod.rs
+++ b/milli/tests/search/mod.rs
@@ -61,9 +61,12 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index {
     let mut cursor = Cursor::new(Vec::new());
     let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
     let reader = Cursor::new(CONTENT.as_bytes());
+
     for doc in serde_json::Deserializer::from_reader(reader).into_iter::<serde_json::Value>() {
-        documents_builder.add_documents(doc.unwrap()).unwrap();
+        let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap());
+        documents_builder.extend_from_json(doc).unwrap();
     }
+
     documents_builder.finish().unwrap();

     cursor.set_position(0);
diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs
index f3b04c4fa..e5dde049c 100644
--- a/milli/tests/search/query_criteria.rs
+++ b/milli/tests/search/query_criteria.rs
@@ -409,7 +409,8 @@ fn criteria_ascdesc() {
             "age": age,
         });

-        batch_builder.add_documents(json).unwrap();
+        let json = Cursor::new(serde_json::to_vec(&json).unwrap());
+        batch_builder.extend_from_json(json).unwrap();
     });

     batch_builder.finish().unwrap();
diff --git a/search/Cargo.toml b/search/Cargo.toml
deleted file mode 100644
index b55c4b8d6..000000000
--- a/search/Cargo.toml
+++ /dev/null
@@ -1,18 +0,0 @@
-[package]
-name = "search"
-version = "0.18.0"
-authors = ["Clément Renault <clement@meilisearch.com>"]
-edition = "2018"
-
-[dependencies]
-anyhow = "1.0.38"
-byte-unit = { version = "4.0.9", default-features = false, features = ["std"] }
-heed = { git = "https://github.com/Kerollmops/heed", tag = "v0.12.1" }
-log = "0.4.14"
-milli = { path = "../milli" }
-serde_json = "1.0.62"
-stderrlog = "0.5.1"
-structopt = { version = "0.3.21", default-features = false }
-
-[target.'cfg(target_os = "linux")'.dependencies]
-jemallocator = "0.3.2"
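
With the standalone `search` crate removed above, the batch builder is the single entry point for preparing documents, CSV included. A round-trip sketch under stated assumptions (`csv_round_trip` is an invented name, and the `id:number` typed-header syntax is an assumption rather than something shown in this patch; a bare header like `title` is treated as a string field):

    use std::io::Cursor;

    use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};

    fn csv_round_trip() -> anyhow::Result<()> {
        // `id:number` declares a typed column (assumed syntax).
        let csv = "id:number,title\n1,The Hobbit\n";

        let mut writer = Cursor::new(Vec::new());
        DocumentBatchBuilder::from_csv(csv.as_bytes(), &mut writer)?.finish()?;

        // The serialized batch reads back the same way the tests above do it.
        let _reader = DocumentBatchReader::from_reader(Cursor::new(writer.into_inner()))?;
        Ok(())
    }
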