From 048e174efb80daa7ca0ac302f53d36fe514d172e Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 10:13:48 +0200 Subject: [PATCH 01/35] Do not allocate when parsing CSV headers --- milli/src/documents/builder.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 2be7c1dd8..391175f31 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -108,7 +108,7 @@ impl DocumentBatchBuilder { .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(&k), t)) + .map(|(k, t)| (this.index.insert(k), t)) .collect::>(); for (i, record) in records.into_records().enumerate() { @@ -161,16 +161,16 @@ enum AllowedType { Number, } -fn parse_csv_header(header: &str) -> (String, AllowedType) { +fn parse_csv_header(header: &str) -> (&str, AllowedType) { // if there are several separators we only split on the last one. match header.rsplit_once(':') { Some((field_name, field_type)) => match field_type { - "string" => (field_name.to_string(), AllowedType::String), - "number" => (field_name.to_string(), AllowedType::Number), + "string" => (field_name, AllowedType::String), + "number" => (field_name, AllowedType::Number), // if the pattern isn't reconized, we keep the whole field. - _otherwise => (header.to_string(), AllowedType::String), + _otherwise => (header, AllowedType::String), }, - None => (header.to_string(), AllowedType::String), + None => (header, AllowedType::String), } } From eb63af1f1024063afc14eca909e99b4b370b2ff1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 13:20:51 +0200 Subject: [PATCH 02/35] Update grenad to 0.4.2 --- milli/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/Cargo.toml b/milli/Cargo.toml index feddf91c5..d980c6041 100644 --- a/milli/Cargo.toml +++ b/milli/Cargo.toml @@ -17,7 +17,7 @@ flatten-serde-json = { path = "../flatten-serde-json" } fst = "0.4.7" fxhash = "0.2.1" geoutils = "0.4.1" -grenad = { version = "0.4.1", default-features = false, features = ["tempfile"] } +grenad = { version = "0.4.2", default-features = false, features = ["tempfile"] } heed = { git = "https://github.com/meilisearch/heed", tag = "v0.12.1", default-features = false, features = ["lmdb", "sync-read-txn"] } json-depth-checker = { path = "../json-depth-checker" } levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] } From 419ce3966c42fcb66fd1dbdb239b4287bf55d74b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:03:48 +0200 Subject: [PATCH 03/35] Rework the DocumentsBatchBuilder/Reader to use grenad --- milli/src/documents/builder.rs | 216 +++++++++++++++--------------- milli/src/documents/mod.rs | 107 ++++++--------- milli/src/documents/reader.rs | 117 +++++++++------- milli/src/documents/serde_impl.rs | 134 ------------------ 4 files changed, 218 insertions(+), 356 deletions(-) delete mode 100644 milli/src/documents/serde_impl.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 391175f31..159afb8d9 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,157 +1,159 @@ -use std::collections::BTreeMap; -use std::io; -use std::io::{Cursor, Write}; +use std::io::{self, Write}; -use byteorder::{BigEndian, WriteBytesExt}; -use serde::Deserializer; -use serde_json::Value; +use grenad::{CompressionType, WriterBuilder}; +use serde_json::{to_writer, Map, Value}; -use super::serde_impl::DocumentVisitor; 
-use super::{ByteCounter, DocumentsBatchIndex, DocumentsMetadata, Error};
-use crate::FieldId;
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 
 /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary
 /// format used by milli.
 ///
-/// The writer used by the DocumentBatchBuilder can be read using a `DocumentBatchReader` to
-/// iterate over the documents.
+/// The writer used by the `DocumentsBatchBuilder` can be read using a `DocumentsBatchReader`
+/// to iterate over the documents.
 ///
 /// ## example:
 /// ```
-/// use milli::documents::DocumentBatchBuilder;
 /// use serde_json::json;
-/// use std::io::Cursor;
+/// use milli::documents::DocumentsBatchBuilder;
 ///
-/// let json = r##"{"id": 1, "name": "foo"}"##;
-/// let mut writer = Cursor::new(Vec::new());
-/// let mut builder = DocumentBatchBuilder::new(&mut writer).unwrap();
-/// builder.extend_from_json(&mut json.as_bytes()).unwrap();
-/// builder.finish().unwrap();
+/// let json = json!({ "id": 1, "name": "foo" });
+///
+/// let mut builder = DocumentsBatchBuilder::new(Vec::new());
+/// builder.append_json_object(json.as_object().unwrap()).unwrap();
+/// let _vector = builder.into_inner().unwrap();
 /// ```
-pub struct DocumentBatchBuilder<W> {
-    inner: ByteCounter<W>,
-    index: DocumentsBatchIndex,
+pub struct DocumentsBatchBuilder<W> {
+    /// The inner grenad writer, the last value must always be the `DocumentsBatchIndex`.
+    writer: grenad::Writer<W>,
+    /// A map that creates the relation between field ids and field names.
+    fields_index: DocumentsBatchIndex,
+    /// The number of documents that were added to this builder,
+    /// it doesn't take the primary key of the documents into account at this point.
+    documents_count: u32,
+
+    /// A buffer to store a temporary obkv buffer and avoid reallocating.
     obkv_buffer: Vec<u8>,
+    /// A buffer to serialize the values and avoid reallocating,
+    /// serialized values are stored in an obkv.
     value_buffer: Vec<u8>,
-    values: BTreeMap<FieldId, Value>,
-    count: usize,
 }
 
-impl<W: io::Write + io::Seek> DocumentBatchBuilder<W> {
-    pub fn new(writer: W) -> Result<Self, Error> {
-        let index = DocumentsBatchIndex::default();
-        let mut writer = ByteCounter::new(writer);
-        // add space to write the offset of the metadata at the end of the writer
-        writer.write_u64::<BigEndian>(0)?;
-
-        Ok(Self {
-            inner: writer,
-            index,
+impl<W: Write> DocumentsBatchBuilder<W> {
+    pub fn new(writer: W) -> DocumentsBatchBuilder<W> {
+        DocumentsBatchBuilder {
+            writer: WriterBuilder::new().compression_type(CompressionType::None).build(writer),
+            fields_index: DocumentsBatchIndex::default(),
+            documents_count: 0,
             obkv_buffer: Vec::new(),
             value_buffer: Vec::new(),
-            values: BTreeMap::new(),
-            count: 0,
-        })
+        }
     }
 
-    /// Returns the number of documents that have been written to the builder.
-    pub fn len(&self) -> usize {
-        self.count
+    /// Returns the number of documents inserted into this builder.
+    pub fn documents_count(&self) -> u32 {
+        self.documents_count
     }
 
-    /// This method must be called after the document addition is terminated. It will put the
-    /// metadata at the end of the file, and write the metadata offset at the beginning on the
-    /// file.
-    pub fn finish(self) -> Result<usize, Error> {
-        let Self { inner: ByteCounter { mut writer, count: offset }, index, count, .. } = self;
+    /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly.
+    pub fn append_json_object(&mut self, object: &Map<String, Value>) -> io::Result<()> {
+        // Make sure that we insert the fields ids in order as the obkv writer has this requirement.
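+        // obkv expects keys to be inserted in strictly increasing order. A field
+        // seen for the first time in a later document gets a fresh, higher id,
+        // so the ids collected from `object.keys()` can come back out of order,
+        // e.g. `[2, 0]` for `{"title": ..., "id": ...}` when `id` was seen first.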
+ let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); + fields_ids.sort_unstable(); - let meta = DocumentsMetadata { count, index }; + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + for field_id in fields_ids { + let key = self.fields_index.name(field_id).unwrap(); + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &object[key])?; + writer.insert(field_id, &self.value_buffer)?; + } - bincode::serialize_into(&mut writer, &meta)?; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; - writer.seek(io::SeekFrom::Start(0))?; - writer.write_u64::(offset as u64)?; - - writer.flush()?; - - Ok(count) + Ok(()) } - /// Extends the builder with json documents from a reader. - pub fn extend_from_json(&mut self, reader: R) -> Result<(), Error> { - let mut de = serde_json::Deserializer::from_reader(reader); - - let mut visitor = DocumentVisitor { - inner: &mut self.inner, - index: &mut self.index, - obkv_buffer: &mut self.obkv_buffer, - value_buffer: &mut self.value_buffer, - values: &mut self.values, - count: &mut self.count, - }; - - de.deserialize_any(&mut visitor).map_err(Error::JsonError)? - } - - /// Creates a builder from a reader of CSV documents. - /// - /// Since all fields in a csv documents are guaranteed to be ordered, we are able to perform - /// optimisations, and extending from another CSV is not allowed. - pub fn from_csv(reader: R, writer: W) -> Result { - let mut this = Self::new(writer)?; - // Ensure that this is the first and only addition made with this builder - debug_assert!(this.index.is_empty()); - - let mut records = csv::Reader::from_reader(reader); - - let headers = records + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. + let mut typed_fields_ids: Vec<_> = reader .headers()? .into_iter() .map(parse_csv_header) - .map(|(k, t)| (this.index.insert(k), t)) - .collect::>(); + .map(|(k, t)| (self.fields_index.insert(k), t)) + .enumerate() + .collect(); + typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); - for (i, record) in records.into_records().enumerate() { - let record = record?; - this.obkv_buffer.clear(); - let mut writer = obkv::KvWriter::new(&mut this.obkv_buffer); - for (value, (fid, ty)) in record.into_iter().zip(headers.iter()) { - let value = match ty { + let mut record = csv::StringRecord::new(); + let mut line = 0; + while reader.read_record(&mut record)? { + // We increment here and not at the end of the while loop to take + // the header offset into account. + line += 1; + + self.obkv_buffer.clear(); + let mut writer = obkv::KvWriter::new(&mut self.obkv_buffer); + + for (i, (field_id, type_)) in typed_fields_ids.iter() { + self.value_buffer.clear(); + + let value = &record[*i]; + match type_ { AllowedType::Number => { if value.trim().is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - value.trim().parse::().map(Value::from).map_err(|error| { - Error::ParseFloat { - error, - // +1 for the header offset. - line: i + 1, - value: value.to_string(), + match value.trim().parse::() { + Ok(float) => { + to_writer(&mut self.value_buffer, &float)?; } - })? 
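+                                // A single number that fails to parse aborts the
+                                // whole CSV import, reporting the bad value and
+                                // the line of the record it came from.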
+ Err(error) => { + return Err(Error::ParseFloat { + error, + line, + value: value.to_string(), + }); + } + } } } AllowedType::String => { if value.is_empty() { - Value::Null + to_writer(&mut self.value_buffer, &Value::Null)?; } else { - Value::String(value.to_string()) + to_writer(&mut self.value_buffer, value)?; } } - }; + } - this.value_buffer.clear(); - serde_json::to_writer(Cursor::new(&mut this.value_buffer), &value)?; - writer.insert(*fid, &this.value_buffer)?; + // We insert into the obkv writer the value buffer that has been filled just above. + writer.insert(*field_id, &self.value_buffer)?; } - this.inner.write_u32::(this.obkv_buffer.len() as u32)?; - this.inner.write_all(&this.obkv_buffer)?; - - this.count += 1; + let internal_id = self.documents_count.to_be_bytes(); + let document_bytes = writer.into_inner()?; + self.writer.insert(internal_id, &document_bytes)?; + self.documents_count += 1; } - Ok(this) + Ok(()) + } + + /// Flushes the content on disk and stores the final version of the `DocumentsBatchIndex`. + pub fn into_inner(mut self) -> io::Result { + let DocumentsBatchBuilder { mut writer, fields_index, .. } = self; + + // We serialize and insert the `DocumentsBatchIndex` as the last key of the grenad writer. + self.value_buffer.clear(); + to_writer(&mut self.value_buffer, &fields_index)?; + writer.insert(DOCUMENTS_BATCH_INDEX_KEY, &self.value_buffer)?; + + writer.into_inner() } } diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 09f15901d..bd0afc6e4 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,24 +1,22 @@ mod builder; -/// The documents module defines an intermediary document format that milli uses for indexation, and -/// provides an API to easily build and read such documents. -/// -/// The `DocumentBatchBuilder` interface allows to write batches of documents to a writer, that can -/// later be read by milli using the `DocumentBatchReader` interface. mod reader; -mod serde_impl; use std::fmt::{self, Debug}; use std::io; use bimap::BiHashMap; -pub use builder::DocumentBatchBuilder; -pub use reader::DocumentBatchReader; +pub use builder::DocumentsBatchBuilder; +pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::FieldId; +/// The key that is used to store the `DocumentsBatchIndex` datastructure, +/// it is the absolute last key of the list. +const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); + /// A bidirectional map that links field ids to their name in a document batch. 
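+/// Ids are assigned in first-seen order: the first field name inserted into the
+/// index gets id 0, the next new name gets id 1, and so on.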
-#[derive(Default, Debug, Serialize, Deserialize)] +#[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); impl DocumentsBatchIndex { @@ -46,8 +44,8 @@ impl DocumentsBatchIndex { self.0.iter() } - pub fn name(&self, id: FieldId) -> Option<&String> { - self.0.get_by_left(&id) + pub fn name(&self, id: FieldId) -> Option<&str> { + self.0.get_by_left(&id).map(AsRef::as_ref) } pub fn recreate_json( @@ -69,50 +67,20 @@ impl DocumentsBatchIndex { } } -#[derive(Debug, Serialize, Deserialize)] -struct DocumentsMetadata { - count: usize, - index: DocumentsBatchIndex, -} - -pub struct ByteCounter { - count: usize, - writer: W, -} - -impl ByteCounter { - fn new(writer: W) -> Self { - Self { count: 0, writer } - } -} - -impl io::Write for ByteCounter { - fn write(&mut self, buf: &[u8]) -> io::Result { - let count = self.writer.write(buf)?; - self.count += count; - Ok(count) - } - - fn flush(&mut self) -> io::Result<()> { - self.writer.flush() - } -} - #[derive(Debug)] pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, - Custom(String), - JsonError(serde_json::Error), - CsvError(csv::Error), - Serialize(bincode::Error), + Csv(csv::Error), + Json(serde_json::Error), + Serialize(serde_json::Error), + Grenad(grenad::Error), Io(io::Error), - DocumentTooLarge, } impl From for Error { fn from(e: csv::Error) -> Self { - Self::CsvError(e) + Self::Csv(e) } } @@ -122,15 +90,15 @@ impl From for Error { } } -impl From for Error { - fn from(other: bincode::Error) -> Self { - Self::Serialize(other) +impl From for Error { + fn from(other: serde_json::Error) -> Self { + Self::Json(other) } } -impl From for Error { - fn from(other: serde_json::Error) -> Self { - Self::JsonError(other) +impl From for Error { + fn from(other: grenad::Error) -> Self { + Self::Grenad(other) } } @@ -140,13 +108,14 @@ impl fmt::Display for Error { Error::ParseFloat { error, line, value } => { write!(f, "Error parsing number {:?} at line {}: {}", value, line, error) } - Error::Custom(s) => write!(f, "Unexpected serialization error: {}", s), - Error::InvalidDocumentFormat => f.write_str("Invalid document addition format."), - Error::JsonError(err) => write!(f, "Couldn't serialize document value: {}", err), + Error::InvalidDocumentFormat => { + f.write_str("Invalid document addition format, missing the documents batch index.") + } Error::Io(e) => write!(f, "{}", e), - Error::DocumentTooLarge => f.write_str("Provided document is too large (>2Gib)"), Error::Serialize(e) => write!(f, "{}", e), - Error::CsvError(e) => write!(f, "{}", e), + Error::Grenad(e) => write!(f, "{}", e), + Error::Csv(e) => write!(f, "{}", e), + Error::Json(e) => write!(f, "{}", e), } } } @@ -158,15 +127,25 @@ impl std::error::Error for Error {} macro_rules! 
documents {
     ($data:tt) => {{
         let documents = serde_json::json!($data);
-        let mut writer = std::io::Cursor::new(Vec::new());
-        let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap();
-        let documents = serde_json::to_vec(&documents).unwrap();
-        builder.extend_from_json(std::io::Cursor::new(documents)).unwrap();
-        builder.finish().unwrap();
+        let documents = match documents {
+            object @ serde_json::Value::Object(_) => vec![object],
+            serde_json::Value::Array(objects) => objects,
+            invalid => {
+                panic!("an array of objects must be specified, {:#?} is not an array", invalid)
+            }
+        };
 
-        writer.set_position(0);
+        let mut builder = crate::documents::DocumentsBatchBuilder::new(Vec::new());
+        for document in documents {
+            let object = match document {
+                serde_json::Value::Object(object) => object,
+                invalid => panic!("an object must be specified, {:#?} is not an object", invalid),
+            };
+            builder.append_json_object(&object).unwrap();
+        }
 
-        crate::documents::DocumentBatchReader::from_reader(writer).unwrap()
+        let vector = builder.into_inner().unwrap();
+        crate::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap()
     }};
 }
diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs
index 14d7c8ceb..3dff999f5 100644
--- a/milli/src/documents/reader.rs
+++ b/milli/src/documents/reader.rs
@@ -1,11 +1,9 @@
+use std::convert::TryInto;
 use std::io;
-use std::io::{BufReader, Read};
-use std::mem::size_of;
 
-use byteorder::{BigEndian, ReadBytesExt};
 use obkv::KvReader;
 
-use super::{DocumentsBatchIndex, DocumentsMetadata, Error};
+use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY};
 use crate::FieldId;
 
 /// The `DocumentsBatchReader` provides a way to iterate over documents that have been created with
 /// a `DocumentsBatchWriter`.
 ///
 /// The documents are returned in the form of `obkv::Reader` where each field is identified with a
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
-pub struct DocumentBatchReader<R> {
-    reader: BufReader<R>,
-    metadata: DocumentsMetadata,
-    buffer: Vec<u8>,
-    seen_documents: usize,
+pub struct DocumentsBatchReader<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
 }
 
-impl<R: io::Read + io::Seek> DocumentBatchReader<R> {
+impl<R: io::Read + io::Seek> DocumentsBatchReader<R> {
     /// Construct a `DocumentsBatchReader` from a reader.
     ///
-    /// It first retrieves the index, then moves to the first document. Subsequent calls to
-    /// `next_document` advance the document reader until all the documents have been read.
-    pub fn from_reader(mut reader: R) -> Result<Self, Error> {
-        let mut buffer = Vec::new();
+    /// It first retrieves the index, then moves to the first document. Use the `into_cursor`
+    /// method to iterate over the documents, from the first to the last.
+    pub fn from_reader(reader: R) -> Result<Self, Error> {
+        let reader = grenad::Reader::new(reader)?;
+        let mut cursor = reader.into_cursor()?;
 
-        let meta_offset = reader.read_u64::<BigEndian>()?;
-        reader.seek(io::SeekFrom::Start(meta_offset))?;
-        reader.read_to_end(&mut buffer)?;
-        let metadata: DocumentsMetadata = bincode::deserialize(&buffer)?;
+        let fields_index = match cursor.move_on_key_equal_to(DOCUMENTS_BATCH_INDEX_KEY)? {
+            Some((_, value)) => serde_json::from_slice(value).map_err(Error::Serialize)?,
+            None => return Err(Error::InvalidDocumentFormat),
+        };
 
-        reader.seek(io::SeekFrom::Start(size_of::<u64>() as u64))?;
-        buffer.clear();
-
-        let reader = BufReader::new(reader);
-
-        Ok(Self { reader, metadata, buffer, seen_documents: 0 })
+        Ok(DocumentsBatchReader { cursor, fields_index })
     }
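+
+    // A typical round trip, assuming `vector` holds the bytes produced by a
+    // `DocumentsBatchBuilder` (illustrative sketch only):
+    //
+    //     let reader = DocumentsBatchReader::from_reader(Cursor::new(vector))?;
+    //     let mut cursor = reader.into_cursor();
+    //     while let Some(document) = cursor.next_document()? {
+    //         // `document` is an obkv::KvReader keyed by `FieldId`.
+    //     }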
 
-    /// Returns the next document in the reader, and wraps it in an `obkv::KvReader`, along with a
-    /// reference to the addition index.
-    pub fn next_document_with_index<'a>(
-        &'a mut self,
-    ) -> io::Result<Option<(&'a DocumentsBatchIndex, KvReader<'a, FieldId>)>> {
-        if self.seen_documents < self.metadata.count {
-            let doc_len = self.reader.read_u32::<BigEndian>()?;
-            self.buffer.resize(doc_len as usize, 0);
-            self.reader.read_exact(&mut self.buffer)?;
-            self.seen_documents += 1;
-
-            let reader = KvReader::new(&self.buffer);
-            Ok(Some((&self.metadata.index, reader)))
-        } else {
-            Ok(None)
-        }
-    }
-
-    /// Return the fields index for the documents batch.
-    pub fn index(&self) -> &DocumentsBatchIndex {
-        &self.metadata.index
-    }
-
-    /// Returns the number of documents in the reader.
-    pub fn len(&self) -> usize {
-        self.metadata.count
+    pub fn documents_count(&self) -> u32 {
+        self.cursor.len().saturating_sub(1).try_into().expect("Invalid number of documents")
     }
 
     pub fn is_empty(&self) -> bool {
-        self.len() == 0
+        self.cursor.len().saturating_sub(1) == 0
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
+
+    /// This method returns a forward cursor over the documents.
+    pub fn into_cursor(self) -> DocumentsBatchCursor<R> {
+        let DocumentsBatchReader { cursor, fields_index } = self;
+        let mut cursor = DocumentsBatchCursor { cursor, fields_index };
+        cursor.reset();
+        cursor
+    }
+}
+
+/// A forward cursor over the documents in a `DocumentsBatchReader`.
+pub struct DocumentsBatchCursor<R> {
+    cursor: grenad::ReaderCursor<R>,
+    fields_index: DocumentsBatchIndex,
+}
+
+impl<R> DocumentsBatchCursor<R> {
+    pub fn into_reader(self) -> DocumentsBatchReader<R> {
+        let DocumentsBatchCursor { cursor, fields_index, .. } = self;
+        DocumentsBatchReader { cursor, fields_index }
+    }
+
+    pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
+        &self.fields_index
+    }
+
+    /// Resets the cursor to be able to read from the start again.
+    pub fn reset(&mut self) {
+        self.cursor.reset();
+    }
+}
+
+impl<R: io::Read + io::Seek> DocumentsBatchCursor<R> {
+    /// Returns the next document, starting from the first one. Subsequent calls to
+    /// `next_document` advance the document reader until all the documents have been read.
+    pub fn next_document(&mut self) -> Result<Option<KvReader<FieldId>>, grenad::Error> {
+        match self.cursor.move_on_next()? {
+            Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => {
+                Ok(Some(KvReader::new(value)))
+            }
+            _otherwise => Ok(None),
+        }
     }
 }
diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs
deleted file mode 100644
index d57bf1ffb..000000000
--- a/milli/src/documents/serde_impl.rs
+++ /dev/null
@@ -1,134 +0,0 @@
-use std::collections::BTreeMap;
-use std::fmt;
-use std::io::{Cursor, Write};
-
-use byteorder::WriteBytesExt;
-use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor};
-use serde::Deserialize;
-use serde_json::Value;
-
-use super::{ByteCounter, DocumentsBatchIndex, Error};
-use crate::FieldId;
-
-macro_rules!
tri { - ($e:expr) => { - match $e { - Ok(r) => r, - Err(e) => return Ok(Err(e.into())), - } - }; -} - -struct FieldIdResolver<'a>(&'a mut DocumentsBatchIndex); - -impl<'a, 'de> DeserializeSeed<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_str(self) - } -} - -impl<'a, 'de> Visitor<'de> for FieldIdResolver<'a> { - type Value = FieldId; - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - Ok(self.0.insert(v)) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a string") - } -} - -struct ValueDeserializer; - -impl<'de> DeserializeSeed<'de> for ValueDeserializer { - type Value = serde_json::Value; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - serde_json::Value::deserialize(deserializer) - } -} - -pub struct DocumentVisitor<'a, W> { - pub inner: &'a mut ByteCounter, - pub index: &'a mut DocumentsBatchIndex, - pub obkv_buffer: &'a mut Vec, - pub value_buffer: &'a mut Vec, - pub values: &'a mut BTreeMap, - pub count: &'a mut usize, -} - -impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { - /// This Visitor value is nothing, since it write the value to a file. - type Value = Result<(), Error>; - - fn visit_seq(self, mut seq: A) -> Result - where - A: SeqAccess<'de>, - { - while let Some(v) = seq.next_element_seed(&mut *self)? { - tri!(v) - } - - Ok(Ok(())) - } - - fn visit_map(self, mut map: A) -> Result - where - A: MapAccess<'de>, - { - while let Some((key, value)) = - map.next_entry_seed(FieldIdResolver(&mut *self.index), ValueDeserializer)? - { - self.values.insert(key, value); - } - - self.obkv_buffer.clear(); - let mut obkv = obkv::KvWriter::new(Cursor::new(&mut *self.obkv_buffer)); - for (key, value) in self.values.iter() { - self.value_buffer.clear(); - // This is guaranteed to work - tri!(serde_json::to_writer(Cursor::new(&mut *self.value_buffer), value)); - tri!(obkv.insert(*key, &self.value_buffer)); - } - - let reader = tri!(obkv.into_inner()).into_inner(); - - tri!(self.inner.write_u32::(reader.len() as u32)); - tri!(self.inner.write_all(reader)); - - *self.count += 1; - self.values.clear(); - - Ok(Ok(())) - } - - fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "a documents, or a sequence of documents.") - } -} - -impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> -where - W: Write, -{ - type Value = Result<(), Error>; - - fn deserialize(self, deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - deserializer.deserialize_map(self) - } -} From e8297ad27e4977f4f43c43f181dc9d7c9ea041dd Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:04:27 +0200 Subject: [PATCH 04/35] Fix the tests for the new DocumentsBatchBuilder/Reader --- milli/src/documents/builder.rs | 332 +++++++++--------- milli/src/documents/mod.rs | 133 ++----- milli/src/search/distinct/mod.rs | 31 +- milli/src/update/index_documents/mod.rs | 63 ++-- milli/src/update/index_documents/transform.rs | 12 +- milli/tests/search/facet_distribution.rs | 26 +- milli/tests/search/mod.rs | 18 +- milli/tests/search/query_criteria.rs | 20 +- milli/tests/search/typo_tolerance.rs | 31 +- 9 files changed, 292 insertions(+), 374 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 159afb8d9..19cc1ce53 100644 --- a/milli/src/documents/builder.rs +++ 
b/milli/src/documents/builder.rs @@ -183,7 +183,8 @@ mod test { use serde_json::{json, Map}; use super::*; - use crate::documents::DocumentBatchReader; + use crate::documents::DocumentsBatchReader; + use crate::FieldId; fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { let mut map = Map::new(); @@ -192,7 +193,7 @@ mod test { let field_name = index.name(fid).unwrap().clone(); let value: Value = serde_json::from_slice(value).unwrap(); - map.insert(field_name, value); + map.insert(field_name.to_string(), value); } Value::Object(map) @@ -200,15 +201,13 @@ mod test { #[test] fn add_single_documents_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let json = serde_json::json!({ "id": 1, "field": "hello!", }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(json.as_object().unwrap()).unwrap(); let json = serde_json::json!({ "blabla": false, @@ -216,100 +215,64 @@ mod test { "id": 1, }); - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); + builder.append_json_object(json.as_object().unwrap()).unwrap(); - assert_eq!(builder.len(), 2); + assert_eq!(builder.documents_count(), 2); + let vector = builder.into_inner().unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 3); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 3); - assert!(reader.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_seq_json() { - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let json = serde_json::json!([{ - "id": 1, - "field": "hello!", - },{ - "blabla": false, - "field": "hello!", - "id": 1, - } - ]); - - builder.extend_from_json(Cursor::new(serde_json::to_vec(&json).unwrap())).unwrap(); - - assert_eq!(builder.len(), 2); - - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 2); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); - assert_eq!(index.len(), 3); - assert_eq!(document.iter().count(), 3); - - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn add_documents_csv() { - let mut cursor = Cursor::new(Vec::new()); + let csv_content = "id:number,field:string\n1,hello!\n2,blabla"; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let csv = "id:number,field:string\n1,hello!\n2,blabla"; + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + assert_eq!(builder.documents_count(), 2); + let 
vector = builder.into_inner().unwrap(); - let builder = - DocumentBatchBuilder::from_csv(Cursor::new(csv.as_bytes()), &mut cursor).unwrap(); - builder.finish().unwrap(); - - cursor.set_position(0); - - let mut reader = DocumentBatchReader::from_reader(cursor).unwrap(); - - let (index, document) = reader.next_document_with_index().unwrap().unwrap(); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); assert_eq!(index.len(), 2); + + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - let (_index, document) = reader.next_document_with_index().unwrap().unwrap(); + let document = cursor.next_document().unwrap().unwrap(); assert_eq!(document.iter().count(), 2); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn simple_csv_document() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -320,22 +283,25 @@ mod test { }) ); - assert!(reader.next_document_with_index().unwrap().is_none()); + assert!(cursor.next_document().unwrap().is_none()); } #[test] fn coma_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United, States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -349,17 +315,20 @@ mod test { #[test] fn quote_in_field() { - let documents = r#"city,country,pop + let csv_content = r#"city,country,pop "Boston","United"" States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = 
reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -373,17 +342,20 @@ mod test { #[test] fn integer_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -397,17 +369,20 @@ mod test { #[test] fn float_in_field() { - let documents = r#"city,country,pop:number + let csv_content = r#"city,country,pop:number "Boston","United States","4628910.01""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -421,17 +396,20 @@ mod test { #[test] fn several_colon_in_header() { - let documents = r#"city:love:string,country:state,pop + let csv_content = r#"city:love:string,country:state,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( 
val, @@ -445,17 +423,20 @@ mod test { #[test] fn ending_by_colon_in_header() { - let documents = r#"city:,country,pop + let csv_content = r#"city:,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -469,17 +450,20 @@ mod test { #[test] fn starting_by_colon_in_header() { - let documents = r#":city,country,pop + let csv_content = r#":city,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); + + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let index = cursor.documents_batch_index().clone(); + + let doc = cursor.next_document().unwrap().unwrap(); + let val = obkv_to_value(&doc, &index); assert_eq!( val, @@ -494,32 +478,36 @@ mod test { #[ignore] #[test] fn starting_by_colon_in_header2() { - let documents = r#":string,country,pop + let csv_content = r#":string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = builder.into_inner().unwrap(); - assert!(reader.next_document_with_index().is_err()); + let mut cursor = + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + + assert!(cursor.next_document().is_err()); } #[test] fn double_colon_in_header() { - let documents = r#"city::string,country,pop + let csv_content = r#"city::string,country,pop "Boston","United States","4628910""#; + let csv = csv::Reader::from_reader(Cursor::new(csv_content)); - let mut buf = Vec::new(); - DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)) - .unwrap() - .finish() - .unwrap(); - let mut reader = DocumentBatchReader::from_reader(Cursor::new(buf)).unwrap(); - let (index, doc) = reader.next_document_with_index().unwrap().unwrap(); - let val = obkv_to_value(&doc, index); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv).unwrap(); + let vector = 
builder.into_inner().unwrap();
+
+        let mut cursor =
+            DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
+        let index = cursor.documents_batch_index().clone();
+
+        let doc = cursor.next_document().unwrap().unwrap();
+        let val = obkv_to_value(&doc, &index);
 
         assert_eq!(
             val,
@@ -533,34 +521,32 @@ mod test {
 
     #[test]
     fn bad_type_in_header() {
-        let documents = r#"city,country:number,pop
+        let csv_content = r#"city,country:number,pop
 "Boston","United States","4628910""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 
     #[test]
     fn bad_column_count1() {
-        let documents = r#"city,country,pop
+        let csv_content = r#"city,country,pop
 "Boston","United States","4628910", "too much""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 
     #[test]
     fn bad_column_count2() {
-        let documents = r#"city,country,pop
+        let csv_content = r#"city,country,pop
 "Boston","United States""#;
+        let csv = csv::Reader::from_reader(Cursor::new(csv_content));
 
-        let mut buf = Vec::new();
-        assert!(
-            DocumentBatchBuilder::from_csv(documents.as_bytes(), Cursor::new(&mut buf)).is_err()
-        );
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        assert!(builder.append_csv(csv).is_err());
     }
 }
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index bd0afc6e4..7a34ae13b 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -159,7 +159,7 @@ mod test {
 
     #[test]
     fn create_documents_no_errors() {
-        let json = json!({
+        let value = json!({
             "number": 1,
             "string": "this is a field",
             "array": ["an", "array"],
@@ -169,26 +169,17 @@ mod test {
             "bool": true
         });
 
-        let json = serde_json::to_vec(&json).unwrap();
-
-        let mut v = Vec::new();
-        let mut cursor = io::Cursor::new(&mut v);
-
-        let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap();
-
-        builder.extend_from_json(Cursor::new(json)).unwrap();
-
-        builder.finish().unwrap();
+        let mut builder = DocumentsBatchBuilder::new(Vec::new());
+        builder.append_json_object(value.as_object().unwrap()).unwrap();
+        let vector = builder.into_inner().unwrap();
 
         let mut documents =
-            DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap();
+            DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor();
 
-        assert_eq!(documents.index().iter().count(), 5);
-
-        let reader = documents.next_document_with_index().unwrap().unwrap();
-
-        assert_eq!(reader.1.iter().count(), 5);
-        assert!(documents.next_document_with_index().unwrap().is_none());
+        assert_eq!(documents.documents_batch_index().iter().count(), 5);
+        let reader = documents.next_document().unwrap().unwrap();
+        assert_eq!(reader.iter().count(), 5);
+        assert!(documents.next_document().unwrap().is_none());
     }
 
     #[test]
     fn test_add_multiple_documents() {
         let doc1 = json!({
             "tata": "hello",
         });
         let doc2 = json!({
             "toto": false,
         });
 
-        let doc1 = serde_json::to_vec(&doc1).unwrap();
-        let doc2 = serde_json::to_vec(&doc2).unwrap();
-
-        let mut v = Vec::new();
-        let mut cursor = io::Cursor::new(&mut v);
- let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(doc1)).unwrap(); - builder.extend_from_json(Cursor::new(doc2)).unwrap(); - - builder.finish().unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(doc1.as_object().unwrap()).unwrap(); + builder.append_json_object(doc2.as_object().unwrap()).unwrap(); + let vector = builder.into_inner().unwrap(); let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_documents_array() { - let docs = json!([ - { "toto": false }, - { "tata": "hello" }, - ]); - - let docs = serde_json::to_vec(&docs).unwrap(); - - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - builder.extend_from_json(Cursor::new(docs)).unwrap(); - - builder.finish().unwrap(); - - let mut documents = - DocumentBatchReader::from_reader(io::Cursor::new(cursor.into_inner())).unwrap(); - - assert_eq!(documents.index().iter().count(), 2); - - let reader = documents.next_document_with_index().unwrap().unwrap(); - - assert_eq!(reader.1.iter().count(), 1); - assert!(documents.next_document_with_index().unwrap().is_some()); - assert!(documents.next_document_with_index().unwrap().is_none()); - } - - #[test] - fn add_invalid_document_format() { - let mut v = Vec::new(); - let mut cursor = io::Cursor::new(&mut v); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - - let docs = json!([[ - { "toto": false }, - { "tata": "hello" }, - ]]); - - let docs = serde_json::to_vec(&docs).unwrap(); - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); - - let docs = json!("hello"); - let docs = serde_json::to_vec(&docs).unwrap(); - - assert!(builder.extend_from_json(Cursor::new(docs)).is_err()); + DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); + assert_eq!(documents.documents_batch_index().iter().count(), 2); + let reader = documents.next_document().unwrap().unwrap(); + assert_eq!(reader.iter().count(), 1); + assert!(documents.next_document().unwrap().is_some()); + assert!(documents.next_document().unwrap().is_none()); } #[test] fn test_nested() { - let mut docs = documents!([{ + let docs_reader = documents!([{ "hello": { "toto": ["hello"] } }]); - let (_index, doc) = docs.next_document_with_index().unwrap().unwrap(); - + let mut cursor = docs_reader.into_cursor(); + let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); } #[test] - fn out_of_order_fields() { + fn out_of_order_json_fields() { let _documents = documents!([ {"id": 1,"b": 0}, {"id": 2,"a": 0,"b": 0}, ]); } + + #[test] + fn out_of_order_csv_fields() { + let csv1_content = "id:number,b\n1,0"; + let csv1 = csv::Reader::from_reader(Cursor::new(csv1_content)); + + let csv2_content = "id:number,a,b\n2,0,0"; + let csv2 = csv::Reader::from_reader(Cursor::new(csv2_content)); + + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_csv(csv1).unwrap(); + builder.append_csv(csv2).unwrap(); + 
let vector = builder.into_inner().unwrap(); + + DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); + } } diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 237fd718a..670fa01ac 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -35,7 +35,7 @@ mod test { use roaring::RoaringBitmap; use serde_json::{json, Value}; - use crate::documents::{DocumentBatchBuilder, DocumentBatchReader}; + use crate::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use crate::index::tests::TempIndex; use crate::index::Index; use crate::update::{ @@ -43,14 +43,11 @@ mod test { }; use crate::{DocumentId, FieldId, BEU32}; - static JSON: Lazy> = Lazy::new(generate_documents); - - fn generate_documents() -> Vec { + static JSON: Lazy> = Lazy::new(|| { let mut rng = rand::thread_rng(); let num_docs = rng.gen_range(10..30); - let mut cursor = Cursor::new(Vec::new()); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); let txts = ["Toto", "Titi", "Tata"]; let cats = (1..10).map(|i| i.to_string()).collect::>(); let cat_ints = (1..10).collect::>(); @@ -63,7 +60,7 @@ mod test { let mut sample_ints = cat_ints.clone(); sample_ints.shuffle(&mut rng); - let doc = json!({ + let json = json!({ "id": i, "txt": txt, "cat-int": rng.gen_range(0..3), @@ -71,13 +68,16 @@ mod test { "cat-ints": sample_ints[..(rng.gen_range(0..3))], }); - let doc = Cursor::new(serde_json::to_vec(&doc).unwrap()); - builder.extend_from_json(doc).unwrap(); + let object = match json { + Value::Object(object) => object, + _ => panic!(), + }; + + builder.append_json_object(&object).unwrap(); } - builder.finish().unwrap(); - cursor.into_inner() - } + builder.into_inner().unwrap() + }); /// Returns a temporary index populated with random test documents, the FieldId for the /// distinct attribute, and the RoaringBitmap with the document ids. @@ -101,7 +101,8 @@ mod test { IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = - crate::documents::DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) + .unwrap(); addition.add_documents(reader).unwrap(); addition.execute().unwrap(); @@ -109,8 +110,8 @@ mod test { let fields_map = index.fields_ids_map(&txn).unwrap(); let fid = fields_map.id(&distinct).unwrap(); - let documents = DocumentBatchReader::from_reader(Cursor::new(&*JSON)).unwrap(); - let map = (0..documents.len() as u32).collect(); + let documents = DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())).unwrap(); + let map = (0..documents.documents_count() as u32).collect(); txn.commit().unwrap(); diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba428f078..7f6e00b11 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -25,7 +25,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use crate::documents::DocumentBatchReader; +use crate::documents::DocumentsBatchReader; pub use crate::update::index_documents::helpers::CursorClonableMmap; use crate::update::{ self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids, @@ -121,7 +121,7 @@ where /// builder, and the builder must be discarded. /// /// Returns the number of documents added to the builder. 
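+    ///
+    /// The reader is typically built from bytes produced by a `DocumentsBatchBuilder`,
+    /// via `DocumentsBatchReader::from_reader`.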
- pub fn add_documents(&mut self, reader: DocumentBatchReader) -> Result + pub fn add_documents(&mut self, reader: DocumentsBatchReader) -> Result where R: Read + Seek, { @@ -590,9 +590,8 @@ mod tests { use maplit::hashset; use super::*; - use crate::documents::DocumentBatchBuilder; + use crate::documents::DocumentsBatchBuilder; use crate::update::DeleteDocuments; - use crate::HashMap; #[test] fn simple_document_replacement() { @@ -1252,21 +1251,17 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); for i in 0..1000 { let key = i.to_string(); - big_object.insert(key, "I am a text!"); + big_object.insert(key, serde_json::Value::from("I am a text!")); } - let mut cursor = Cursor::new(Vec::new()); - - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - let big_object = Cursor::new(serde_json::to_vec(&big_object).unwrap()); - builder.extend_from_json(big_object).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1288,23 +1283,19 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); - let mut big_object = HashMap::new(); - big_object.insert(S("id"), "wow"); + let mut big_object = serde_json::Map::new(); + big_object.insert(S("id"), serde_json::Value::from("wow")); let content: String = (0..=u16::MAX) .into_iter() .map(|p| p.to_string()) .reduce(|a, b| a + " " + b.as_ref()) .unwrap(); - big_object.insert("content".to_string(), &content); + big_object.insert("content".to_string(), serde_json::Value::from(content)); - let mut cursor = Cursor::new(Vec::new()); - - let big_object = serde_json::to_string(&big_object).unwrap(); - let mut builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); - builder.extend_from_json(&mut big_object.as_bytes()).unwrap(); - builder.finish().unwrap(); - cursor.set_position(0); - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let mut builder = DocumentsBatchBuilder::new(Vec::new()); + builder.append_json_object(&big_object).unwrap(); + let vector = builder.into_inner().unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); @@ -1843,18 +1834,20 @@ mod tests { // Create 200 documents with a long text let content = { - let documents: Vec<_> = (0..200i32) + let documents_iter = (0..200i32) .into_iter() .map(|i| serde_json::json!({ "id": i, "script": script })) - .collect(); + .filter_map(|json| match json { + serde_json::Value::Object(object) => Some(object), + _ => None, + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = crate::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); - writer.set_position(0); - crate::documents::DocumentBatchReader::from_reader(writer).unwrap() + let mut builder = 
crate::documents::DocumentsBatchBuilder::new(Vec::new()); + for object in documents_iter { + builder.append_json_object(&object).unwrap(); + } + let vector = builder.into_inner().unwrap(); + crate::documents::DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap() }; // Index those 200 long documents diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 12a858024..129357075 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -14,7 +14,7 @@ use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; use super::{IndexDocumentsMethod, IndexerConfig}; -use crate::documents::{DocumentBatchReader, DocumentsBatchIndex}; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; @@ -152,7 +152,7 @@ impl<'a, 'i> Transform<'a, 'i> { pub fn read_documents( &mut self, - mut reader: DocumentBatchReader, + reader: DocumentsBatchReader, wtxn: &mut heed::RwTxn, progress_callback: F, ) -> Result @@ -160,7 +160,8 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let fields_index = reader.index(); + let mut cursor = reader.into_cursor(); + let fields_index = cursor.documents_batch_index(); let external_documents_ids = self.index.external_documents_ids(wtxn)?; let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; @@ -186,7 +187,8 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - while let Some((addition_index, document)) = reader.next_document_with_index()? { + let addition_index = cursor.documents_batch_index().clone(); + while let Some(document) = cursor.next_document()? 
{ let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -840,7 +842,7 @@ fn update_primary_key<'a>( None => { let mut json = Map::new(); for (key, value) in document.iter() { - let key = addition_index.name(key).cloned(); + let key = addition_index.name(key).map(ToString::to_string); let value = serde_json::from_slice::(&value).ok(); if let Some((k, v)) = key.zip(value) { diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index d3aece2ab..66713de1e 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -3,9 +3,10 @@ use std::io::Cursor; use big_s::S; use heed::EnvOpenOptions; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{FacetDistribution, Index}; +use serde_json::{Deserializer, Map, Value}; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -30,35 +31,30 @@ fn test_facet_distribution_with_no_facet_values() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( - r#"[ - { + r#"{ "id": 123, "title": "What a week, hu...", "genres": [], "tags": ["blue"] - }, + } { "id": 345, "title": "I am the pig!", "tags": ["red"] - } - ]"#, + }"#, ); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 472fbafe0..4cf117dc7 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -6,10 +6,11 @@ use big_s::S; use either::{Either, Left, Right}; use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, DocumentId, Index, Member}; use serde::Deserialize; +use serde_json::{Deserializer, Map, Value}; use slice_group_by::GroupBy; mod distinct; @@ -62,21 +63,18 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut documents_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut documents_builder = 
DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for doc in serde_json::Deserializer::from_reader(reader).into_iter::() { - let doc = Cursor::new(serde_json::to_vec(&doc.unwrap()).unwrap()); - documents_builder.extend_from_json(doc).unwrap(); + for result in Deserializer::from_reader(reader).into_iter::>() { + let object = result.unwrap(); + documents_builder.append_json_object(&object).unwrap(); } - documents_builder.finish().unwrap(); - - cursor.set_position(0); + let vector = documents_builder.into_inner().unwrap(); // index documents - let content = DocumentBatchReader::from_reader(cursor).unwrap(); + let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(content).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 893d7c30a..89a6a6eec 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -5,7 +5,7 @@ use big_s::S; use heed::EnvOpenOptions; use itertools::Itertools; use maplit::hashset; -use milli::documents::{DocumentBatchBuilder, DocumentBatchReader}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; use milli::{AscDesc, Criterion, Index, Member, Search, SearchResult}; use rand::Rng; @@ -393,8 +393,7 @@ fn criteria_ascdesc() { let mut builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - let mut cursor = Cursor::new(Vec::new()); - let mut batch_builder = DocumentBatchBuilder::new(&mut cursor).unwrap(); + let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); (0..ASC_DESC_CANDIDATES_THRESHOLD + 1).for_each(|_| { let mut rng = rand::thread_rng(); @@ -412,16 +411,17 @@ fn criteria_ascdesc() { "age": age, }); - let json = Cursor::new(serde_json::to_vec(&json).unwrap()); - batch_builder.extend_from_json(json).unwrap(); + let object = match json { + serde_json::Value::Object(object) => object, + _ => panic!(), + }; + + batch_builder.append_json_object(&object).unwrap(); }); - batch_builder.finish().unwrap(); - - cursor.set_position(0); - - let reader = DocumentBatchReader::from_reader(cursor).unwrap(); + let vector = batch_builder.into_inner().unwrap(); + let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); builder.add_documents(reader).unwrap(); builder.execute().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 9a7986c5e..63bf22579 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -106,26 +106,23 @@ fn test_typo_disabled_on_word() { options.map_size(4096 * 100); let index = Index::new(options, tmp.path()).unwrap(); - let documents = json!([ - { - "id": 1usize, - "data": "zealand", - }, - { - "id": 2usize, - "data": "zearand", - }, - ]); + let mut builder = milli::documents::DocumentsBatchBuilder::new(Vec::new()); + let doc1 = json!({ + "id": 1usize, + "data": "zealand", + }); - let mut writer = std::io::Cursor::new(Vec::new()); - let mut builder = milli::documents::DocumentBatchBuilder::new(&mut writer).unwrap(); - let documents = serde_json::to_vec(&documents).unwrap(); - builder.extend_from_json(std::io::Cursor::new(documents)).unwrap(); - builder.finish().unwrap(); + let doc2 = json!({ + "id": 2usize, + "data": "zearand", + }); - writer.set_position(0); + 
builder.append_json_object(doc1.as_object().unwrap()).unwrap();
+    builder.append_json_object(doc2.as_object().unwrap()).unwrap();
+    let vector = builder.into_inner().unwrap();
 
-    let documents = milli::documents::DocumentBatchReader::from_reader(writer).unwrap();
+    let documents =
+        milli::documents::DocumentsBatchReader::from_reader(std::io::Cursor::new(vector)).unwrap();
 
     let mut txn = index.write_txn().unwrap();
     let config = IndexerConfig::default();
From 6d0498df2445975d8cfff3cb8c4b1453bd5e00e6 Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 14 Jun 2022 16:22:59 +0200
Subject: [PATCH 05/35] Fix the fuzz tests

---
 milli/fuzz/fuzz_targets/indexing.rs | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs
index b618aabad..5c3b79ed7 100644
--- a/milli/fuzz/fuzz_targets/indexing.rs
+++ b/milli/fuzz/fuzz_targets/indexing.rs
@@ -7,10 +7,10 @@ use anyhow::{bail, Result};
 use arbitrary_json::ArbitraryValue;
 use heed::EnvOpenOptions;
 use libfuzzer_sys::fuzz_target;
-use milli::documents::{DocumentBatchBuilder, DocumentBatchReader};
+use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};
 use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings};
 use milli::Index;
-use serde_json::Value;
+use serde_json::{Map, Value};
 
 #[cfg(target_os = "linux")]
 #[global_allocator]
 static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
 /// reads json from input and write an obkv batch to writer.
 pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result<usize> {
     let writer = BufWriter::new(writer);
-    let mut builder = DocumentBatchBuilder::new(writer)?;
-    builder.extend_from_json(input)?;
+    let mut builder = DocumentsBatchBuilder::new(writer);
 
-    if builder.len() == 0 {
+    let values: Vec<Map<String, Value>> = serde_json::from_reader(input)?;
+    // Bail before appending anything: at this point the builder is necessarily
+    // empty, so the emptiness of the payload must be checked on the parsed values.
+    if values.is_empty() {
         bail!("Empty payload");
     }
 
-    let count = builder.finish()?;
+    for object in values {
+        builder.append_json_object(&object)?;
+    }
 
-    Ok(count)
+    let count = builder.documents_count();
+    let vector = builder.into_inner()?;
+
+    Ok(count as usize)
 }
 
 fn index_documents(
     index: &mut milli::Index,
-    documents: DocumentBatchReader<Cursor<Vec<u8>>>,
+    documents: DocumentsBatchReader<Cursor<Vec<u8>>>,
 ) -> Result<()> {
     let config = IndexerConfig::default();
     let mut wtxn = index.write_txn()?;
@@ -98,7 +103,7 @@ fuzz_target!(|batches: Vec<Vec<ArbitraryValue>>| {
             // We ignore all malformed documents
             if let Ok(_) = read_json(json.as_bytes(), &mut documents) {
                 documents.rewind().unwrap();
-                let documents = DocumentBatchReader::from_reader(documents).unwrap();
+                let documents = DocumentsBatchReader::from_reader(documents).unwrap();
                 // A lot of errors can come out of milli and we don't know which ones are normal or not
                 // so we are only going to look for the unexpected panics.
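                 // The rewind + from_reader round-trip above is the pattern this series
                 // introduces everywhere; a minimal sketch of the reading side, assuming
                 // an in-memory cursor of bytes, looks like:
                 //
                 //     let reader = DocumentsBatchReader::from_reader(Cursor::new(bytes))?;
                 //     let mut cursor = reader.into_cursor();
                 //     while let Some(obkv) = cursor.next_document()? { /* one document */ }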
let _ = index_documents(&mut index, documents); From a4ceef96246e8e121007c16a0d159cd4c99ede11 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 16:35:59 +0200 Subject: [PATCH 06/35] Fix the cli for the new DocumentsBatchBuilder/Reader structs --- cli/src/main.rs | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/cli/src/main.rs b/cli/src/main.rs index 14bc797af..dcd0f407a 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -8,6 +8,7 @@ use std::time::Instant; use byte_unit::Byte; use eyre::Result; use indicatif::{MultiProgress, ProgressBar, ProgressStyle}; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; @@ -225,9 +226,9 @@ impl Performer for DocumentAddition { DocumentAdditionFormat::Jsonl => documents_from_jsonl(reader)?, }; - let reader = milli::documents::DocumentBatchReader::from_reader(Cursor::new(documents))?; + let reader = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - println!("Adding {} documents to the index.", reader.len()); + println!("Adding {} documents to the index.", reader.documents_count()); let mut txn = index.write_txn()?; let config = milli::update::IndexerConfig { log_every_n: Some(100), ..Default::default() }; @@ -321,35 +322,35 @@ fn indexing_callback(step: milli::update::UpdateIndexingStep, bars: &[ProgressBa } fn documents_from_jsonl(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - let mut buf = String::new(); - let mut reader = BufReader::new(reader); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl Read) -> Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[derive(Debug, StructOpt)] From f29114f94aff42592d8f0a932784f578433e0744 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 17:21:54 +0200 Subject: [PATCH 07/35] Fix http-ui to fit with the new DocumentsBatchBuilder/Reader structs --- http-ui/src/main.rs | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index ce4fa7ba5..63b9ee5e0 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -3,7 +3,7 @@ mod update_store; use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; use std::fmt::Display; use std::fs::{create_dir_all, File}; -use std::io::{BufRead, BufReader, Cursor, Read}; +use std::io::{BufReader, Cursor, Read}; use std::net::SocketAddr; use std::num::{NonZeroU32, NonZeroUsize}; use std::path::PathBuf; @@ -18,7 +18,7 @@ use either::Either; use flate2::read::GzDecoder; use futures::{stream, FutureExt, StreamExt}; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::tokenizer::TokenizerBuilder; use milli::update::UpdateIndexingStep::*; use milli::update::{ @@ -399,7 +399,7 @@ async fn main() -> anyhow::Result<()> { otherwise => panic!("invalid update format {:?}", otherwise), }; - let documents = DocumentBatchReader::from_reader(Cursor::new(documents))?; + let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; builder.add_documents(documents)?; @@ -1032,35 +1032,36 @@ async fn main() -> anyhow::Result<()> { Ok(()) } -fn documents_from_jsonl(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let reader = BufReader::new(reader); - for result in BufReader::new(reader).lines() { - let line = result?; - documents.extend_from_json(Cursor::new(line))?; + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_json(reader: impl 
io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_json(reader: impl Read) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } -fn documents_from_csv(reader: impl io::Read) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; +fn documents_from_csv(reader: impl Read) -> anyhow::Result> { + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } #[cfg(test)] From a97d4d63b9f1f29edac7f78aa33710682996e63c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:17:48 +0200 Subject: [PATCH 08/35] Fix the benchmarks --- benchmarks/benches/utils.rs | 41 ++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index b2a9966a2..091b9b0f5 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,13 +1,13 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; use criterion::BenchmarkId; use heed::EnvOpenOptions; -use milli::documents::DocumentBatchReader; +use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; @@ -150,39 +150,38 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader documents_from_jsonl(reader).unwrap(), otherwise => panic!("invalid update format {:?}", otherwise), }; - DocumentBatchReader::from_reader(Cursor::new(documents)).unwrap() + DocumentsBatchReader::from_reader(Cursor::new(documents)).unwrap() } -fn documents_from_jsonl(mut reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; +fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { + let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let mut buf = String::new(); - - while reader.read_line(&mut buf)? 
> 0 { - documents.extend_from_json(&mut buf.as_bytes())?; - buf.clear(); + for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + let object = result?; + documents.append_json_object(&object)?; } - documents.finish()?; - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - let mut documents = milli::documents::DocumentBatchBuilder::new(&mut writer)?; + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + let list: Vec> = serde_json::from_reader(reader)?; - documents.extend_from_json(reader)?; - documents.finish()?; + for object in list { + documents.append_json_object(&object)?; + } - Ok(writer.into_inner()) + documents.into_inner().map_err(Into::into) } fn documents_from_csv(reader: impl BufRead) -> anyhow::Result> { - let mut writer = Cursor::new(Vec::new()); - milli::documents::DocumentBatchBuilder::from_csv(reader, &mut writer)?.finish()?; + let csv = csv::Reader::from_reader(reader); - Ok(writer.into_inner()) + let mut documents = DocumentsBatchBuilder::new(Vec::new()); + documents.append_csv(csv)?; + + documents.into_inner().map_err(Into::into) } enum AllowedType { From bdc426388379e80e03d8291d3a12b2a6361f0571 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 09/35] Introduce the validate_documents_batch function --- milli/src/update/index_documents/transform.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 129357075..42187fc1e 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{IndexDocumentsMethod, IndexerConfig}; +use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; From cefffde9af704bc9b4146206a852972745941eda Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:07:28 +0200 Subject: [PATCH 10/35] Improve the .gitignore of the fuzz crate --- milli/fuzz/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/milli/fuzz/.gitignore b/milli/fuzz/.gitignore index cb73742e4..ebf2c9395 100644 --- a/milli/fuzz/.gitignore +++ b/milli/fuzz/.gitignore @@ -1,2 +1,5 @@ +Cargo.lock +target/ + /corpus/ /artifacts/ From 0146175fe60e104519ebd23fa2e14bd1ff3e8bfc Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 14 Jun 2022 18:12:15 +0200 Subject: [PATCH 11/35] Introduce the validate_documents_batch function --- milli/src/documents/builder.rs | 36 ++--- milli/src/documents/mod.rs | 28 +++- milli/src/error.rs | 6 + .../extract/extract_geo_points.rs | 9 +- milli/src/update/index_documents/mod.rs | 36 +++-- milli/src/update/index_documents/transform.rs | 26 +--- milli/src/update/index_documents/validate.rs | 140 ++++++++++++++++++ 7 files changed, 208 insertions(+), 73 deletions(-) create mode 100644 milli/src/update/index_documents/validate.rs diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 19cc1ce53..15a22090a 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -180,24 +180,10 @@ fn 
parse_csv_header(header: &str) -> (&str, AllowedType) { mod test { use std::io::Cursor; - use serde_json::{json, Map}; + use serde_json::json; use super::*; - use crate::documents::DocumentsBatchReader; - use crate::FieldId; - - fn obkv_to_value(obkv: &obkv::KvReader, index: &DocumentsBatchIndex) -> Value { - let mut map = Map::new(); - - for (fid, value) in obkv.iter() { - let field_name = index.name(fid).unwrap().clone(); - let value: Value = serde_json::from_slice(value).unwrap(); - - map.insert(field_name.to_string(), value); - } - - Value::Object(map) - } + use crate::documents::{obkv_to_object, DocumentsBatchReader}; #[test] fn add_single_documents_json() { @@ -272,7 +258,7 @@ mod test { DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -301,7 +287,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -328,7 +314,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -355,7 +341,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -382,7 +368,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -409,7 +395,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -436,7 +422,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -463,7 +449,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, @@ -507,7 +493,7 @@ mod test { let index = cursor.documents_batch_index().clone(); let doc = cursor.next_document().unwrap().unwrap(); - let val = obkv_to_value(&doc, &index); + let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); assert_eq!( val, diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 7a34ae13b..ee3593bf8 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -6,15 +6,30 @@ use std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; -use crate::FieldId; +use crate::error::{FieldIdMapMissingEntry, 
InternalError}; +use crate::{FieldId, Object, Result}; /// The key that is used to store the `DocumentsBatchIndex` datastructure, /// it is the absolute last key of the list. const DOCUMENTS_BATCH_INDEX_KEY: [u8; 8] = u64::MAX.to_be_bytes(); +/// Helper function to convert an obkv reader into a JSON object. +pub fn obkv_to_object(obkv: &KvReader, index: &DocumentsBatchIndex) -> Result { + obkv.iter() + .map(|(field_id, value)| { + let field_name = index.name(field_id).ok_or_else(|| { + FieldIdMapMissingEntry::FieldId { field_id, process: "obkv_to_object" } + })?; + let value = serde_json::from_slice(value).map_err(InternalError::SerdeJson)?; + Ok((field_name.to_string(), value)) + }) + .collect() +} + /// A bidirectional map that links field ids to their name in a document batch. #[derive(Default, Clone, Debug, Serialize, Deserialize)] pub struct DocumentsBatchIndex(pub BiHashMap); @@ -48,11 +63,12 @@ impl DocumentsBatchIndex { self.0.get_by_left(&id).map(AsRef::as_ref) } - pub fn recreate_json( - &self, - document: &obkv::KvReaderU16, - ) -> Result, crate::Error> { - let mut map = serde_json::Map::new(); + pub fn id(&self, name: &str) -> Option { + self.0.get_by_right(name).cloned() + } + + pub fn recreate_json(&self, document: &obkv::KvReaderU16) -> Result { + let mut map = Object::new(); for (k, v) in document.iter() { // TODO: TAMO: update the error type diff --git a/milli/src/error.rs b/milli/src/error.rs index 57ae1c85a..d34130210 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -141,10 +141,16 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco #[derive(Error, Debug)] pub enum GeoError { + #[error("The `_geo` field in the document with the id: `{document_id}` is not an object. Was expecting an object with the `_geo.lat` and `_geo.lng` fields but instead got `{value}`.")] + NotAnObject { document_id: Value, value: Value }, + #[error("Could not find latitude nor longitude in the document with the id: `{document_id}`. Was expecting `_geo.lat` and `_geo.lng` fields.")] + MissingLatitudeAndLongitude { document_id: Value }, #[error("Could not find latitude in the document with the id: `{document_id}`. Was expecting a `_geo.lat` field.")] MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a number but instead got `{value}`.")]
diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs
index fffae5e77..0f804b93b 100644
--- a/milli/src/update/index_documents/extract/extract_geo_points.rs
+++ b/milli/src/update/index_documents/extract/extract_geo_points.rs
@@ -7,6 +7,7 @@ use serde_json::Value;
 
 use super::helpers::{create_writer, writer_into_reader, GrenadParameters};
 use crate::error::GeoError;
+use crate::update::index_documents::extract_float_from_value;
 use crate::{FieldId, InternalError, Result};
 
 /// Extracts the geographical coordinates contained in each document under the `_geo` field.
@@ -61,11 +62,3 @@ pub fn extract_geo_points(
 
     Ok(writer_into_reader(writer)?)
 }
-
-fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
-    match value {
-        Value::Number(ref n) => n.as_f64().ok_or(value),
-        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
-        value => Err(value),
-    }
-}
diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs
index 7f6e00b11..2fb7cbcd9 100644
--- a/milli/src/update/index_documents/mod.rs
+++ b/milli/src/update/index_documents/mod.rs
@@ -2,11 +2,13 @@ mod extract;
 mod helpers;
 mod transform;
 mod typed_chunk;
+mod validate;
 
 use std::collections::HashSet;
 use std::io::{Cursor, Read, Seek};
 use std::iter::FromIterator;
 use std::num::{NonZeroU32, NonZeroUsize};
+use std::result::Result as StdResult;
 
 use crossbeam_channel::{Receiver, Sender};
 use heed::types::Str;
@@ -25,13 +27,19 @@ pub use self::helpers::{
 };
 use self::helpers::{grenad_obkv_into_chunks, GrenadParameters};
 pub use self::transform::{Transform, TransformOutput};
-use crate::documents::DocumentsBatchReader;
+use self::validate::validate_documents_batch;
+pub use self::validate::{
+    extract_float_from_value, validate_document_id, validate_document_id_from_json,
+    validate_geo_from_json,
+};
+use crate::documents::{obkv_to_object, DocumentsBatchReader};
+use crate::error::UserError;
 pub use crate::update::index_documents::helpers::CursorClonableMmap;
 use crate::update::{
     self, Facets, IndexerConfig, UpdateIndexingStep, WordPrefixDocids,
     WordPrefixPairProximityDocids, WordPrefixPositionDocids, WordsPrefixesFst,
 };
-use crate::{Index, Result, RoaringBitmapCodec, UserError};
+use crate::{Index, Result, RoaringBitmapCodec};
 
 static MERGED_DATABASE_COUNT: usize = 7;
 static PREFIX_DATABASE_COUNT: usize = 5;
@@ -117,19 +125,27 @@ where
     /// Adds a batch of documents to the current builder.
     ///
-    /// Since the documents are progressively added to the writer, a failure will cause a stale
-    /// builder, and the builder must be discarded.
+    /// Since the documents are progressively added to the writer, a failure will only
+    /// return an error and not the `IndexDocuments` struct, as it is invalid to use it afterward.
     ///
     /// Returns the number of documents added to the builder.
-    pub fn add_documents<R>(&mut self, reader: DocumentsBatchReader<R>) -> Result<u64>
-    where
-        R: Read + Seek,
-    {
+    pub fn add_documents<R: Read + Seek>(
+        mut self,
+        reader: DocumentsBatchReader<R>,
+    ) -> Result<(Self, StdResult<u64, UserError>)> {
         // Early return when there is no document to add
         if reader.is_empty() {
-            return Ok(0);
+            return Ok((self, Ok(0)));
         }
 
+        // We check for user errors in this validator and if there is one, we can return
+        // the `IndexDocuments` struct as it is still valid to send more documents into it.
+        // However, if there is an internal error we throw it away!
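+        //
+        // From the caller's side the new contract reads like this, a sketch using
+        // exactly the names the tests and benchmarks below rely on:
+        //
+        //     let (builder, user_error) = builder.add_documents(documents).unwrap();
+        //     user_error.unwrap(); // a bad document surfaces here, the builder stays usable
+        //     builder.execute().unwrap();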
+        let reader = match validate_documents_batch(self.wtxn, self.index, reader)? {
+            Ok(reader) => reader,
+            Err(user_error) => return Ok((self, Err(user_error))),
+        };
+
         let indexed_documents = self
             .transform
             .as_mut()
@@ -139,7 +155,7 @@ where
 
         self.added_documents += indexed_documents;
 
-        Ok(indexed_documents)
+        Ok((self, Ok(indexed_documents)))
     }
 
     #[logging_timer::time("IndexDocuments::{}")]
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 42187fc1e..bc7eefd33 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -17,6 +17,7 @@ use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
+use crate::update::index_documents::validate_document_id_from_json;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
@@ -782,14 +783,6 @@ fn compute_primary_key_pair(
     }
 }
 
-fn validate_document_id(document_id: &str) -> Option<&str> {
-    let document_id = document_id.trim();
-    Some(document_id).filter(|id| {
-        !id.is_empty()
-            && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
-    })
-}
-
 /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec<T>`.
 ///
 /// The size and alignment of T and U must match.
@@ -813,22 +806,7 @@ fn update_primary_key<'a>(
 ) -> Result<Cow<'a, str>> {
     match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) {
         Some((_, bytes)) => {
-            let value = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
-                Value::String(string) => match validate_document_id(&string) {
-                    Some(s) if s.len() == string.len() => string,
-                    Some(s) => s.to_string(),
-                    None => {
-                        return Err(UserError::InvalidDocumentId {
-                            document_id: Value::String(string),
-                        }
-                        .into())
-                    }
-                },
-                Value::Number(number) => number.to_string(),
-                content => {
-                    return Err(UserError::InvalidDocumentId { document_id: content.clone() }.into())
-                }
-            };
+            let value = validate_document_id_from_json(bytes)??;
             serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?;
             Ok(Cow::Owned(value))
         }
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
new file mode 100644
index 000000000..b4c0cb68f
--- /dev/null
+++ b/milli/src/update/index_documents/validate.rs
@@ -0,0 +1,140 @@
+use std::io::{Read, Seek};
+use std::result::Result as StdResult;
+
+use serde_json::Value;
+
+use crate::error::{GeoError, InternalError, UserError};
+use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader};
+use crate::{Index, Result};
+
+/// This function validates a batch of documents by checking that:
+/// - we can infer a primary key,
+/// - all the document ids exist and are valid, and
+/// - the `_geo` field is valid depending on the settings.
+pub fn validate_documents_batch<R: Read + Seek>(
+    rtxn: &heed::RoTxn,
+    index: &Index,
+    reader: DocumentsBatchReader<R>,
+) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> {
+    let mut cursor = reader.into_cursor();
+    let documents_batch_index = cursor.documents_batch_index().clone();
+
+    // The primary key *field id* that has already been set for this index or the one
+    // we will guess by searching for the first key that contains "id" as a substring.
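+    //
+    // A worked example of the guessing rule implemented below: for a batch whose
+    // fields index maps `0 -> "title"`, `1 -> "id"`, `2 -> "user_id"`, the filter
+    // keeps `id` and `user_id`, and `min_by_key` on the field id picks `id`.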
+    let (primary_key, primary_key_id) = match index.primary_key(rtxn)? {
+        Some(primary_key) => match documents_batch_index.id(primary_key) {
+            Some(id) => (primary_key, id),
+            None => {
+                return match cursor.next_document()? {
+                    Some(first_document) => Ok(Err(UserError::MissingDocumentId {
+                        primary_key: primary_key.to_string(),
+                        document: obkv_to_object(&first_document, &documents_batch_index)?,
+                    })),
+                    // If there is no document in this batch the best we can do is to return this error.
+                    None => Ok(Err(UserError::MissingPrimaryKey)),
+                };
+            }
+        },
+        None => {
+            let guessed = documents_batch_index
+                .iter()
+                .filter(|(_, name)| name.contains("id"))
+                .min_by_key(|(fid, _)| *fid);
+            match guessed {
+                Some((id, name)) => (name.as_str(), *id),
+                None => return Ok(Err(UserError::MissingPrimaryKey)),
+            }
+        }
+    };
+
+    // If the settings specify that a `_geo` field must be used, we must check its
+    // validity in all the documents of this batch; this is when we return `Some`.
+    let geo_field_id = match documents_batch_index.id("_geo") {
+        Some(geo_field_id) if index.sortable_fields(rtxn)?.contains("_geo") => Some(geo_field_id),
+        _otherwise => None,
+    };
+
+    while let Some(document) = cursor.next_document()? {
+        let document_id = match document.get(primary_key_id) {
+            Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? {
+                Ok(document_id) => document_id,
+                Err(user_error) => return Ok(Err(user_error)),
+            },
+            None => {
+                return Ok(Err(UserError::MissingDocumentId {
+                    primary_key: primary_key.to_string(),
+                    document: obkv_to_object(&document, &documents_batch_index)?,
+                }))
+            }
+        };
+
+        if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
+            if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
+                return Ok(Err(UserError::from(user_error)));
+            }
+        }
+    }
+
+    Ok(Ok(cursor.into_reader()))
+}
+
+/// Returns a trimmed version of the document id or `None` if it is invalid.
+pub fn validate_document_id(document_id: &str) -> Option<&str> {
+    let id = document_id.trim();
+    if !id.is_empty()
+        && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    {
+        Some(id)
+    } else {
+        None
+    }
+}
+
+/// Parses a JSON encoded document id and validates it, returning a user error when it is invalid.
+pub fn validate_document_id_from_json(bytes: &[u8]) -> Result<StdResult<String, UserError>> {
+    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+        Value::String(string) => match validate_document_id(&string) {
+            Some(s) if s.len() == string.len() => Ok(Ok(string)),
+            Some(s) => Ok(Ok(s.to_string())),
+            None => {
+                return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) }))
+            }
+        },
+        Value::Number(number) => Ok(Ok(number.to_string())),
+        content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
+    }
+}
+
+/// Try to extract an `f64` from a JSON `Value` and return the `Value`
+/// in the `Err` variant if it failed.
+pub fn extract_float_from_value(value: Value) -> StdResult<f64, Value> {
+    match value {
+        Value::Number(ref n) => n.as_f64().ok_or(value),
+        Value::String(ref s) => s.parse::<f64>().map_err(|_| value),
+        value => Err(value),
+    }
+}
+
+pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result<StdResult<(), GeoError>> {
+    let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?
{ + Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { + (Some(lat), Some(lng)) => { + match (extract_float_from_value(lat), extract_float_from_value(lng)) { + (Ok(_), Ok(_)) => Ok(()), + (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }), + (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }), + (Err(lat), Err(lng)) => { + Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }) + } + } + } + (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }), + (Some(_), None) => Err(GeoError::MissingLongitude { document_id }), + (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }), + }, + value => Err(GeoError::NotAnObject { document_id, value }), + }; + + Ok(result) +} From fcfc4caf8c4512f9dce768017d95ac91804a90d2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:36:27 +0200 Subject: [PATCH 12/35] Move the Object type in the lib.rs file and use it everywhere --- benchmarks/benches/utils.rs | 20 ++++++++---------- cli/src/main.rs | 9 ++++---- http-ui/src/main.rs | 16 ++++++-------- milli/fuzz/fuzz_targets/indexing.rs | 2 +- milli/src/documents/builder.rs | 5 +++-- milli/src/error.rs | 6 ++---- milli/src/lib.rs | 21 ++++++++++--------- .../extract/extract_geo_points.rs | 1 - milli/src/update/index_documents/transform.rs | 2 +- milli/tests/search/facet_distribution.rs | 6 +++--- milli/tests/search/mod.rs | 6 +++--- 11 files changed, 43 insertions(+), 51 deletions(-) diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 091b9b0f5..630e17943 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -11,8 +11,8 @@ use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{ IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig, Settings, }; -use milli::{Filter, Index}; -use serde_json::{Map, Value}; +use milli::{Filter, Index, Object}; +use serde_json::Value; pub struct Conf<'a> { /// where we are going to create our database.mmdb directory @@ -96,12 +96,10 @@ pub fn base_setup(conf: &Conf) -> Index { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents_from(conf.dataset, conf.dataset_format); - - builder.add_documents(documents).unwrap(); - + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -156,7 +154,7 @@ pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -166,7 +164,7 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -221,14 +219,14 @@ impl CSVDocumentDeserializer { } impl Iterator for 
CSVDocumentDeserializer { - type Item = anyhow::Result>; + type Item = anyhow::Result; fn next(&mut self) -> Option { let csv_document = self.documents.next()?; match csv_document { Ok(csv_document) => { - let mut document = Map::new(); + let mut document = Object::new(); for ((field_name, field_type), value) in self.headers.iter().zip(csv_document.into_iter()) diff --git a/cli/src/main.rs b/cli/src/main.rs index dcd0f407a..db4ca91ab 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -13,8 +13,7 @@ use milli::update::UpdateIndexingStep::{ ComputeIdsAndMergeDocuments, IndexDocuments, MergeDataIntoFinalDatabase, RemapDocumentAddition, }; use milli::update::{self, IndexDocumentsConfig, IndexDocumentsMethod, IndexerConfig}; -use milli::Index; -use serde_json::{Map, Value}; +use milli::{Index, Object}; use structopt::StructOpt; #[cfg(target_os = "linux")] @@ -325,7 +324,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -335,7 +334,7 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; @@ -424,7 +423,7 @@ impl Search { filter: &Option, offset: &Option, limit: &Option, - ) -> Result>> { + ) -> Result> { let txn = index.read_txn()?; let mut search = index.search(&txn); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 63b9ee5e0..8167076c6 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -26,11 +26,11 @@ use milli::update::{ }; use milli::{ obkv_to_json, CompressionType, Filter as MilliFilter, FilterCondition, FormatOptions, Index, - MatcherBuilder, SearchResult, SortError, + MatcherBuilder, Object, SearchResult, SortError, }; use once_cell::sync::OnceCell; use serde::{Deserialize, Serialize}; -use serde_json::{Map, Value}; +use serde_json::Value; use structopt::StructOpt; use tokio::fs::File as TFile; use tokio::io::AsyncWriteExt; @@ -169,11 +169,7 @@ impl<'s, A: AsRef<[u8]>> Highlighter<'s, A> { } } - fn highlight_record( - &self, - object: &mut Map, - attributes_to_highlight: &HashSet, - ) { + fn highlight_record(&self, object: &mut Object, attributes_to_highlight: &HashSet) { // TODO do we need to create a string for element that are not and needs to be highlight? 
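         // (In outline, the loop below rewrites each selected value in place; a sketch,
         // assuming this file's recursive `highlight_value` helper, which this hunk does
         // not show:
         //
         //     *value = self.highlight_value(std::mem::take(value));
         // )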
for (key, value) in object.iter_mut() { if attributes_to_highlight.contains(key) { @@ -708,7 +704,7 @@ async fn main() -> anyhow::Result<()> { #[derive(Debug, Serialize)] #[serde(rename_all = "camelCase")] struct Answer { - documents: Vec>, + documents: Vec, number_of_candidates: u64, facets: BTreeMap>, } @@ -1036,7 +1032,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); let reader = BufReader::new(reader); - for result in serde_json::Deserializer::from_reader(reader).into_iter::>() { + for result in serde_json::Deserializer::from_reader(reader).into_iter::() { let object = result?; documents.append_json_object(&object)?; } @@ -1046,7 +1042,7 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec> = serde_json::from_reader(reader)?; + let list: Vec = serde_json::from_reader(reader)?; for object in list { documents.append_json_object(&object)?; diff --git a/milli/fuzz/fuzz_targets/indexing.rs b/milli/fuzz/fuzz_targets/indexing.rs index 5c3b79ed7..e4f42655e 100644 --- a/milli/fuzz/fuzz_targets/indexing.rs +++ b/milli/fuzz/fuzz_targets/indexing.rs @@ -21,7 +21,7 @@ pub fn read_json(input: impl Read, writer: impl Write + Seek) -> Result { let writer = BufWriter::new(writer); let mut builder = DocumentsBatchBuilder::new(writer); - let values: Vec> = serde_json::from_reader(input)?; + let values: Vec = serde_json::from_reader(input)?; if builder.documents_count() == 0 { bail!("Empty payload"); } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 15a22090a..589e52269 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,10 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; -use serde_json::{to_writer, Map, Value}; +use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary /// format used by milli. @@ -55,7 +56,7 @@ impl DocumentsBatchBuilder { } /// Appends a new JSON object into the batch and updates the `DocumentsBatchIndex` accordingly. - pub fn append_json_object(&mut self, object: &Map) -> io::Result<()> { + pub fn append_json_object(&mut self, object: &Object) -> io::Result<()> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
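         // (A small worked instance of that requirement: for `{"b": 1, "a": 2}` the two
         // keys get the ids `b -> 0` and `a -> 1` in insertion order, and the sort below
         // makes us write `(0, b-value)` before `(1, a-value)`, which is the ascending
         // key order `obkv`'s writer expects.)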
let mut fields_ids: Vec<_> = object.keys().map(|k| self.fields_index.insert(&k)).collect(); fields_ids.sort_unstable(); diff --git a/milli/src/error.rs b/milli/src/error.rs index d34130210..a23472951 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -4,12 +4,10 @@ use std::{io, str}; use heed::{Error as HeedError, MdbError}; use rayon::ThreadPoolBuildError; -use serde_json::{Map, Value}; +use serde_json::Value; use thiserror::Error; -use crate::{CriterionError, DocumentId, FieldId, SortError}; - -pub type Object = Map; +use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { ["_geo", "_geoDistance", "_geoPoint", "_geoRadius"].contains(&keyword) diff --git a/milli/src/lib.rs b/milli/src/lib.rs index 81cd057d5..a7be87183 100644 --- a/milli/src/lib.rs +++ b/milli/src/lib.rs @@ -20,7 +20,7 @@ use std::hash::BuildHasherDefault; pub use filter_parser::{Condition, FilterCondition}; use fxhash::{FxHasher32, FxHasher64}; pub use grenad::CompressionType; -use serde_json::{Map, Value}; +use serde_json::Value; pub use {charabia as tokenizer, heed}; pub use self::asc_desc::{AscDesc, AscDescError, Member, SortError}; @@ -43,20 +43,21 @@ pub use self::search::{ pub type Result = std::result::Result; +pub type Attribute = u32; +pub type BEU32 = heed::zerocopy::U32; +pub type BEU64 = heed::zerocopy::U64; +pub type DocumentId = u32; pub type FastMap4 = HashMap>; pub type FastMap8 = HashMap>; +pub type FieldDistribution = BTreeMap; +pub type FieldId = u16; +pub type Object = serde_json::Map; +pub type Position = u32; +pub type RelativePosition = u16; pub type SmallString32 = smallstr::SmallString<[u8; 32]>; pub type SmallVec16 = smallvec::SmallVec<[T; 16]>; pub type SmallVec32 = smallvec::SmallVec<[T; 32]>; pub type SmallVec8 = smallvec::SmallVec<[T; 8]>; -pub type BEU32 = heed::zerocopy::U32; -pub type BEU64 = heed::zerocopy::U64; -pub type Attribute = u32; -pub type DocumentId = u32; -pub type FieldId = u16; -pub type Position = u32; -pub type RelativePosition = u16; -pub type FieldDistribution = BTreeMap; /// A GeoPoint is a point in cartesian plan, called xyz_point in the code. Its metadata /// is a tuple composed of 1. the DocumentId of the associated document and 2. 
the original point @@ -82,7 +83,7 @@ pub fn obkv_to_json( displayed_fields: &[FieldId], fields_ids_map: &FieldsIdsMap, obkv: obkv::KvReaderU16, -) -> Result> { +) -> Result { displayed_fields .iter() .copied() diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 0f804b93b..46ef9ba9b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -1,6 +1,5 @@ use std::fs::File; use std::io; -use std::result::Result as StdResult; use concat_arrays::concat_arrays; use serde_json::Value; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index bc7eefd33..4ece58509 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -13,7 +13,7 @@ use serde_json::{Map, Value}; use smartstring::SmartString; use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn}; -use super::{validate_document_id, IndexDocumentsMethod, IndexerConfig}; +use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 66713de1e..8890285e7 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -5,8 +5,8 @@ use heed::EnvOpenOptions; use maplit::hashset; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{FacetDistribution, Index}; -use serde_json::{Deserializer, Map, Value}; +use milli::{FacetDistribution, Index, Object}; +use serde_json::Deserializer; #[test] fn test_facet_distribution_with_no_facet_values() { @@ -46,7 +46,7 @@ fn test_facet_distribution_with_no_facet_values() { }"#, ); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 4cf117dc7..0b6ce80cc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -8,9 +8,9 @@ use heed::EnvOpenOptions; use maplit::{hashmap, hashset}; use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader}; use milli::update::{IndexDocuments, IndexDocumentsConfig, IndexerConfig, Settings}; -use milli::{AscDesc, Criterion, DocumentId, Index, Member}; +use milli::{AscDesc, Criterion, DocumentId, Index, Member, Object}; use serde::Deserialize; -use serde_json::{Deserializer, Map, Value}; +use serde_json::Deserializer; use slice_group_by::GroupBy; mod distinct; @@ -66,7 +66,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); - for result in Deserializer::from_reader(reader).into_iter::>() { + for result in Deserializer::from_reader(reader).into_iter::() { let object = result.unwrap(); documents_builder.append_json_object(&object).unwrap(); } From 399eec5c0101780f74b9794957865c3fe8b3d519 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 14:35:19 +0200 Subject: [PATCH 
13/35] Fix the indexation tests --- benchmarks/benches/indexing.rs | 110 ++++++---- cli/src/main.rs | 7 +- http-ui/src/main.rs | 6 +- milli/src/index.rs | 25 ++- milli/src/search/distinct/mod.rs | 5 +- milli/src/search/facet/filter.rs | 5 +- milli/src/update/clear_documents.rs | 5 +- milli/src/update/delete_documents.rs | 18 +- milli/src/update/index_documents/mod.rs | 205 +++++++++++-------- milli/src/update/index_documents/validate.rs | 1 + milli/src/update/settings.rs | 70 ++++--- milli/tests/search/facet_distribution.rs | 6 +- milli/tests/search/mod.rs | 6 +- milli/tests/search/query_criteria.rs | 6 +- milli/tests/search/typo_tolerance.rs | 7 +- 15 files changed, 288 insertions(+), 194 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 3ae0a1a84..1b501b21a 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -132,12 +132,13 @@ fn indexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -223,11 +224,12 @@ fn deleting_songs_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -279,11 +281,12 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -294,19 +297,21 @@ fn indexing_songs_in_three_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS_4_4, "csv"); - 
builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -339,13 +344,14 @@ fn indexing_songs_without_faceted_numbers(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -377,12 +383,13 @@ fn indexing_songs_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -415,12 +422,13 @@ fn indexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -507,11 +515,12 @@ fn deleting_wiki_in_batches_default(c: &mut Criterion) { let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -564,12 +573,13 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_1_2, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -581,24 +591,26 @@ fn indexing_wiki_in_three_batches(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) 
.unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_3_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES_4_4, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -631,12 +643,13 @@ fn indexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -720,11 +733,12 @@ fn deleting_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -775,12 +789,13 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { // as we don't care about the time it takes. 
let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_1_2, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -791,21 +806,23 @@ fn indexing_movies_in_three_batches(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_3_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES_4_4, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -861,12 +878,13 @@ fn indexing_nested_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -922,11 +940,12 @@ fn deleting_nested_movies_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -984,12 +1003,13 @@ fn indexing_nested_movies_without_faceted_fields(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::NESTED_MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1021,12 +1041,13 @@ fn indexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut 
builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1113,11 +1134,12 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let config = IndexerConfig::default(); let mut wtxn = index.write_txn().unwrap(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/cli/src/main.rs b/cli/src/main.rs index db4ca91ab..0d197af17 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -255,7 +255,7 @@ impl Performer for DocumentAddition { let bar = progesses.add(bar); bars.push(bar); } - let mut addition = milli::update::IndexDocuments::new( + let addition = milli::update::IndexDocuments::new( &mut txn, &index, &config, @@ -263,7 +263,10 @@ impl Performer for DocumentAddition { |step| indexing_callback(step, &bars), ) .unwrap(); - addition.add_documents(reader)?; + let (addition, user_error) = addition.add_documents(reader)?; + if let Err(error) = user_error { + return Err(error.into()); + } std::thread::spawn(move || { progesses.join().unwrap(); diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 8167076c6..117aa31e8 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -374,7 +374,7 @@ async fn main() -> anyhow::Result<()> { }); }; - let mut builder = milli::update::IndexDocuments::new( + let builder = milli::update::IndexDocuments::new( &mut wtxn, &index_cloned, GLOBAL_CONFIG.get().unwrap(), @@ -397,8 +397,8 @@ async fn main() -> anyhow::Result<()> { let documents = DocumentsBatchReader::from_reader(Cursor::new(documents))?; - builder.add_documents(documents)?; - + let (builder, user_error) = builder.add_documents(documents)?; + let _count = user_error?; let result = builder.execute(); match result { diff --git a/milli/src/index.rs b/milli/src/index.rs index 9637b4103..272877912 100644 --- a/milli/src/index.rs +++ b/milli/src/index.rs @@ -1212,10 +1212,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1234,7 +1235,7 @@ pub(crate) mod tests { // we add all the documents a second time. 
we are supposed to get the same // field_distribution in the end let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); let content = documents!([ @@ -1242,7 +1243,8 @@ pub(crate) mod tests { { "id": 2, "name": "bob", "age": 20 }, { "id": 2, "name": "bob", "age": 20 }, ]); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1265,10 +1267,11 @@ pub(crate) mod tests { ]); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1333,10 +1336,11 @@ pub(crate) mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1390,10 +1394,11 @@ pub(crate) mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/search/distinct/mod.rs b/milli/src/search/distinct/mod.rs index 670fa01ac..1a9c56cf3 100644 --- a/milli/src/search/distinct/mod.rs +++ b/milli/src/search/distinct/mod.rs @@ -97,14 +97,15 @@ mod test { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut addition = + let addition = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); let reader = crate::documents::DocumentsBatchReader::from_reader(Cursor::new(JSON.as_slice())) .unwrap(); - addition.add_documents(reader).unwrap(); + let (addition, user_error) = addition.add_documents(reader).unwrap(); + user_error.unwrap(); addition.execute().unwrap(); let fields_map = index.fields_ids_map(&txn).unwrap(); diff --git a/milli/src/search/facet/filter.rs b/milli/src/search/facet/filter.rs index d89413f62..41e2f0657 100644 --- a/milli/src/search/facet/filter.rs +++ b/milli/src/search/facet/filter.rs @@ -648,10 +648,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/src/update/clear_documents.rs b/milli/src/update/clear_documents.rs index d1939df7b..3fe57eeae 100644 --- a/milli/src/update/clear_documents.rs +++ b/milli/src/update/clear_documents.rs @@ -100,9 +100,10 @@ mod tests { ]); let indexing_config = 
IndexDocumentsConfig::default(); let config = IndexerConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Clear all documents from the database. diff --git a/milli/src/update/delete_documents.rs b/milli/src/update/delete_documents.rs index 3b519c101..49e7de8ae 100644 --- a/milli/src/update/delete_documents.rs +++ b/milli/src/update/delete_documents.rs @@ -657,13 +657,13 @@ mod tests { fn insert_documents<'t, R: std::io::Read + std::io::Seek>( wtxn: &mut RwTxn<'t, '_>, index: &'t Index, - documents: crate::documents::DocumentBatchReader, + documents: crate::documents::DocumentsBatchReader, ) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); } @@ -701,9 +701,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // delete those documents, ids are synchronous therefore 0, 1, and 2. @@ -736,9 +737,10 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Delete not all of the documents but some of them. diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 2fb7cbcd9..ae42483f0 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -141,7 +141,12 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch(self.wtxn, self.index, reader)? { + let reader = match validate_documents_batch( + self.wtxn, + self.index, + self.config.autogenerate_docids, + reader, + )? 
{ Ok(reader) => reader, Err(user_error) => return Ok((self, Err(user_error))), }; @@ -626,10 +631,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -642,10 +648,11 @@ mod tests { // Second we send 1 document with id 1, to erase the previous ones. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "name": "updated kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -662,9 +669,11 @@ mod tests { { "id": 2, "name": "updated kevina" }, { "id": 3, "name": "updated benoit" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); wtxn.commit().unwrap(); // Check that there is **always** 3 documents. @@ -694,10 +703,11 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -722,9 +732,10 @@ mod tests { // Second we send 1 document with id 1, to force it to be merged with the previous one. let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "id": 1, "age": 25 } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -765,7 +776,7 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); assert!(builder.add_documents(content).is_err()); wtxn.commit().unwrap(); @@ -794,10 +805,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -814,9 +826,10 @@ mod tests { // Second we send 1 document with the generated uuid, to erase the previous ones. 
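// (Replacement is keyed on the external document id here: since this new
// document's id matches `kevin_uuid`, it overwrites the previously indexed
// version instead of creating a second document.)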
let mut wtxn = index.write_txn().unwrap(); let content = documents!([ { "name": "updated kevin", "id": kevin_uuid } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -856,9 +869,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -873,9 +887,10 @@ mod tests { let content = documents!([ { "name": "new kevin" } ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -898,9 +913,10 @@ mod tests { let content = documents!([]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -924,7 +940,7 @@ mod tests { let content = documents!([ { "id": "brume bleue", "name": "kevin" } ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); assert!(builder.add_documents(content).is_err()); @@ -934,9 +950,10 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); // There is a space in the document id. 
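// (That space was in the previous, rejected id, `"brume bleue"`. The `32` sent
// next is a plain integer, which the document id validation accepts; string
// ids must be non-empty and use only `a-zA-Z0-9`, `-` and `_`, as checked by
// `validate_document_id` further down in this series.)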
let content = documents!([ { "id": 32, "name": "kevin" } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -963,9 +980,10 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1009,9 +1027,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1020,7 +1039,7 @@ mod tests { update_method: IndexDocumentsMethod::UpdateDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let documents = documents!([ { @@ -1030,7 +1049,8 @@ mod tests { } ]); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1057,9 +1077,10 @@ mod tests { update_method: IndexDocumentsMethod::ReplaceDocuments, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1099,10 +1120,11 @@ mod tests { { "id": 2, "_geo": { "lng": "42" }, "_geo.lat": "31" }, { "id": 3, "_geo.lat": 31, "_geo.lng": "42" }, ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1138,10 +1160,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1151,10 +1174,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1164,10 +1188,11 @@ mod tests { 
let documents = documents!([ { "id": 0, "_geo": { "lat": "lol", "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1177,10 +1202,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": [12, 13], "lng": 42 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1190,10 +1216,11 @@ mod tests { let documents = documents!([ { "id": 0, "_geo": { "lat": 12, "lng": "hello" } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), @@ -1217,10 +1244,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); assert_eq!(index.primary_key(&wtxn).unwrap(), Some("objectId")); @@ -1237,10 +1265,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); let external_documents_ids = index.external_documents_ids(&wtxn).unwrap(); assert!(external_documents_ids.get("30").is_some()); @@ -1249,10 +1278,11 @@ mod tests { { "objectId": 30, "title": "Hamlet", "_geo": { "lat": 12, "lng": 89 } } ]); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1281,10 +1311,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1315,10 +1346,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - 
builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1373,10 +1405,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1426,10 +1459,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1558,10 +1592,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1628,10 +1663,11 @@ mod tests { // index the documents let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1720,10 +1756,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1737,10 +1774,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1759,10 +1797,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1787,10 +1826,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let 
indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1832,10 +1872,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); } @@ -1870,10 +1911,11 @@ mod tests { let mut wtxn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // Create one long document @@ -1884,10 +1926,11 @@ mod tests { // Index this one long document let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1901,7 +1944,7 @@ mod tests { let index = Index::new(options, tmp).unwrap(); let mut wtxn = index.write_txn().unwrap(); let indexer_config = IndexerConfig::default(); - let mut builder = IndexDocuments::new( + let builder = IndexDocuments::new( &mut wtxn, &index, &indexer_config, @@ -1930,8 +1973,10 @@ mod tests { "branch_id_number": 0 }]}; - builder.add_documents(doc1).unwrap(); - builder.add_documents(doc2).unwrap(); + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index b4c0cb68f..0ed1f1cc0 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -15,6 +15,7 @@ use crate::{Index, Result}; pub fn validate_documents_batch( rtxn: &heed::RoTxn, index: &Index, + autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { let mut cursor = reader.into_cursor(); diff --git a/milli/src/update/settings.rs b/milli/src/update/settings.rs index ccf29eb49..5f39579b7 100644 --- a/milli/src/update/settings.rs +++ b/milli/src/update/settings.rs @@ -735,10 +735,11 @@ mod tests { ]); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); 
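// `execute` consumes the builder and performs the actual indexing; the new
// documents only become visible to readers once the surrounding write
// transaction is committed below.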
wtxn.commit().unwrap(); @@ -798,10 +799,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -850,10 +852,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -880,10 +883,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we change the displayed fields to be only the age. @@ -934,10 +938,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -974,10 +979,11 @@ mod tests { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1016,10 +1022,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1067,10 +1074,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1110,10 +1118,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, 
&config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1142,10 +1151,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1172,10 +1182,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction we provide some stop_words @@ -1251,10 +1262,11 @@ mod tests { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); // In the same transaction provide some synonyms @@ -1389,10 +1401,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1452,10 +1465,11 @@ mod tests { ]); let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/facet_distribution.rs b/milli/tests/search/facet_distribution.rs index 8890285e7..83d692d7f 100644 --- a/milli/tests/search/facet_distribution.rs +++ b/milli/tests/search/facet_distribution.rs @@ -29,8 +29,7 @@ fn test_facet_distribution_with_no_facet_values() { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new( r#"{ @@ -55,7 +54,8 @@ fn test_facet_distribution_with_no_facet_values() { // index documents let content = 
DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/mod.rs b/milli/tests/search/mod.rs index 0b6ce80cc..3b8960fcc 100644 --- a/milli/tests/search/mod.rs +++ b/milli/tests/search/mod.rs @@ -61,8 +61,7 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut documents_builder = DocumentsBatchBuilder::new(Vec::new()); let reader = Cursor::new(CONTENT.as_bytes()); @@ -75,7 +74,8 @@ pub fn setup_search_index_with_criteria(criteria: &[Criterion]) -> Index { // index documents let content = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(content).unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/query_criteria.rs b/milli/tests/search/query_criteria.rs index 89a6a6eec..a96366f5e 100644 --- a/milli/tests/search/query_criteria.rs +++ b/milli/tests/search/query_criteria.rs @@ -390,8 +390,7 @@ fn criteria_ascdesc() { // index documents let config = IndexerConfig { max_memory: Some(10 * 1024 * 1024), ..Default::default() }; let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; - let mut builder = - IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); let mut batch_builder = DocumentsBatchBuilder::new(Vec::new()); @@ -422,7 +421,8 @@ fn criteria_ascdesc() { let vector = batch_builder.into_inner().unwrap(); let reader = DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap(); - builder.add_documents(reader).unwrap(); + let (builder, user_error) = builder.add_documents(reader).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/milli/tests/search/typo_tolerance.rs b/milli/tests/search/typo_tolerance.rs index 63bf22579..7c4cf8971 100644 --- a/milli/tests/search/typo_tolerance.rs +++ b/milli/tests/search/typo_tolerance.rs @@ -127,11 +127,10 @@ fn test_typo_disabled_on_word() { let mut txn = index.write_txn().unwrap(); let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); - let mut builder = - IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); - - builder.add_documents(documents).unwrap(); + let builder = IndexDocuments::new(&mut txn, &index, &config, indexing_config, |_| ()).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); txn.commit().unwrap(); From 2ceeb51c37a98e8233edab175467e11101ef5dd2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 15:14:20 +0200 Subject: [PATCH 14/35] Support the auto-generated ids when validating documents --- milli/src/update/index_documents/mod.rs | 6 ++++-- 
milli/src/update/index_documents/validate.rs | 12 +++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ae42483f0..1a1fc9a0e 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -778,7 +778,8 @@ mod tests { let indexing_config = IndexDocumentsConfig::default(); let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()).unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // Check that there is no document. @@ -943,7 +944,8 @@ mod tests { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) .unwrap(); - assert!(builder.add_documents(content).is_err()); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + assert!(user_error.is_err()); wtxn.commit().unwrap(); // First we send 1 document with a valid id. diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0ed1f1cc0..0cb0b4aff 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -19,20 +19,20 @@ pub fn validate_documents_batch( reader: DocumentsBatchReader<R>, ) -> Result<StdResult<DocumentsBatchReader<R>, UserError>> { let mut cursor = reader.into_cursor(); - let documents_batch_index = cursor.documents_batch_index().clone(); + let mut documents_batch_index = cursor.documents_batch_index().clone(); // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { Some(primary_key) => match documents_batch_index.id(primary_key) { Some(id) => (primary_key, id), + None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), document: obkv_to_object(&first_document, &documents_batch_index)?, })), - // If there is no document in this batch the best we can do is to return this error. None => Ok(Err(UserError::MissingPrimaryKey)), }; } @@ -40,10 +40,11 @@ pub fn validate_documents_batch( None => { let guessed = documents_batch_index .iter() - .filter(|(_, name)| name.contains("id")) + .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { Some((id, name)) => (name.as_str(), *id), + None if autogenerate_docids => ("id", documents_batch_index.insert("id")), None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -56,12 +57,16 @@ pub fn validate_documents_batch( _otherwise => None, }; + let mut count = 0; while let Some(document) = cursor.next_document()? { let document_id = match document.get(primary_key_id) { Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)?
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }, + None if autogenerate_docids => { + format!("{{auto-generated id of the {}th document}}", count) + } None => { return Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -75,6 +80,7 @@ pub fn validate_documents_batch( return Ok(Err(UserError::from(user_error))); } } + count += 1; } Ok(Ok(cursor.into_reader())) From 19eb3b4708c1cd2619d2a77cefbcda7ff5e4bda5 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 16:06:52 +0200 Subject: [PATCH 15/35] Make sure that we do not accept floats as document ids --- milli/src/update/index_documents/mod.rs | 47 ++++++++++++++++++++ milli/src/update/index_documents/validate.rs | 2 +- 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 1a1fc9a0e..5bce3b851 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1987,4 +1987,51 @@ mod tests { assert_eq!(ids.len(), map.len()); } + + #[test] + fn primary_key_must_not_contain_floats() { + let tmp = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(4096 * 100); + let index = Index::new(options, tmp).unwrap(); + let mut wtxn = index.write_txn().unwrap(); + let indexer_config = IndexerConfig::default(); + let builder = IndexDocuments::new( + &mut wtxn, + &index, + &indexer_config, + IndexDocumentsConfig::default(), + |_| (), + ) + .unwrap(); + + let doc1 = documents! {[{ + "id": -228142, + "title": "asdsad", + }]}; + + let doc2 = documents! {[{ + "id": 228143.56, + "title": "something", + }]}; + + let doc3 = documents! {[{ + "id": -228143.56, + "title": "something", + }]}; + + let doc4 = documents!
{[{ "id": 2.0, "title": "something", }]}; + + let (builder, user_error) = builder.add_documents(doc1).unwrap(); + user_error.unwrap(); + let (builder, user_error) = builder.add_documents(doc2).unwrap(); + assert!(user_error.is_err()); + let (builder, user_error) = builder.add_documents(doc3).unwrap(); + assert!(user_error.is_err()); + let (_builder, user_error) = builder.add_documents(doc4).unwrap(); + assert!(user_error.is_err()); + } } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index 0cb0b4aff..c69c754ac 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -108,7 +108,7 @@ pub fn validate_document_id_from_json(bytes: &[u8]) -> Result<StdResult<String, UserError>> { - Value::Number(number) => Ok(Ok(number.to_string())), + Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())), content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })), } } From 8ebf5eed0d80beb623a056d4e70a5d0535cd7181 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 15 Jun 2022 17:58:52 +0200 Subject: [PATCH 16/35] Make the nested primary key work --- milli/src/error.rs | 2 + milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 5 +- milli/src/update/index_documents/validate.rs | 228 ++++++++++++++---- 4 files changed, 191 insertions(+), 46 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index a23472951..d05acbe1c 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -121,6 +121,8 @@ only composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and undersco MaxDatabaseSizeReached, #[error("Document doesn't have a `{}` attribute: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] MissingDocumentId { primary_key: String, document: Object }, + #[error("Document has too many matching `{}` attributes: `{}`.", .primary_key, serde_json::to_string(.document).unwrap())] + TooManyDocumentIds { primary_key: String, document: Object }, #[error("The primary key inference process failed because the engine did not find any fields containing `id` substring in their name. If your document identifier does not contain any `id` substring, you can set the primary key of the index.")] MissingPrimaryKey, #[error("There is no more space left on the device. Consider increasing the size of the disk/partition.")] diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 5bce3b851..ba1064684 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -29,7 +29,7 @@ use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; use self::validate::validate_documents_batch; pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_from_json, + extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, }; use crate::documents::{obkv_to_object, DocumentsBatchReader}; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 4ece58509..38f6dc8ff 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -17,7 +17,7 @@ use super::{IndexDocumentsMethod, IndexerConfig}; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{Error, InternalError, UserError}; use crate::index::db_name; -use crate::update::index_documents::validate_document_id_from_json; +use crate::update::index_documents::validate_document_id_value; use crate::update::{AvailableDocumentsIds, UpdateIndexingStep}; use crate::{ ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index, @@ -806,7 +806,8 @@ fn update_primary_key<'a>( ) -> Result<Cow<'a, str>> { match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { Some((_, bytes)) => { - let value = validate_document_id_from_json(bytes)??; + let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; + let value = validate_document_id_value(document_id)??; serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; Ok(Cow::Owned(value)) } diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs index c69c754ac..32e8de03f 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/validate.rs @@ -1,11 +1,16 @@ use std::io::{Read, Seek}; +use std::iter; use std::result::Result as StdResult; use serde_json::Value; +use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader}; use crate::error::{GeoError, InternalError, UserError}; -use crate::update::index_documents::{obkv_to_object, DocumentsBatchReader}; +use crate::update::index_documents::obkv_to_object; +use crate::{FieldId, Index, Object, Result}; + +/// The symbol used to define levels in a nested primary key. +const PRIMARY_KEY_SPLIT_SYMBOL: char = '.'; /// This function validates a batch of documents by checking that: /// - we can infer a primary key, @@ -23,10 +28,15 @@ pub fn validate_documents_batch( // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. - let (primary_key, primary_key_id) = match index.primary_key(rtxn)? { + let primary_key = match index.primary_key(rtxn)?
{ + Some(primary_key) if primary_key.contains(PRIMARY_KEY_SPLIT_SYMBOL) => { + PrimaryKey::nested(primary_key) + } Some(primary_key) => match documents_batch_index.id(primary_key) { - Some(id) => (primary_key, id), - None if autogenerate_docids => (primary_key, documents_batch_index.insert(primary_key)), + Some(id) => PrimaryKey::flat(primary_key, id), + None if autogenerate_docids => { + PrimaryKey::flat(primary_key, documents_batch_index.insert(primary_key)) + } None => { return match cursor.next_document()? { Some(first_document) => Ok(Err(UserError::MissingDocumentId { @@ -43,8 +53,10 @@ pub fn validate_documents_batch( .filter(|(_, name)| name.to_lowercase().contains("id")) .min_by_key(|(fid, _)| *fid); match guessed { - Some((id, name)) => (name.as_str(), *id), - None if autogenerate_docids => ("id", documents_batch_index.insert("id")), + Some((id, name)) => PrimaryKey::flat(name.as_str(), *id), + None if autogenerate_docids => { + PrimaryKey::flat("id", documents_batch_index.insert("id")) + } None => return Ok(Err(UserError::MissingPrimaryKey)), } } @@ -59,20 +71,15 @@ pub fn validate_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match document.get(primary_key_id) { - Some(document_id_bytes) => match validate_document_id_from_json(document_id_bytes)? { - Ok(document_id) => document_id, - Err(user_error) => return Ok(Err(user_error)), - }, - None if autogenerate_docids => { - format!("{{auto-generated id of the {}nth document}}", count) - } - None => { - return Ok(Err(UserError::MissingDocumentId { - primary_key: primary_key.to_string(), - document: obkv_to_object(&document, &documents_batch_index)?, - })) - } + let document_id = match fetch_document_id( + &document, + &documents_batch_index, + primary_key, + autogenerate_docids, + count, + )? { + Ok(document_id) => document_id, + Err(user_error) => return Ok(Err(user_error)), }; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { @@ -86,30 +93,167 @@ pub fn validate_documents_batch( Ok(Ok(cursor.into_reader())) } +/// Retrieve the document id after validating it, returning a `UserError` +/// if the id is invalid or can't be guessed. +fn fetch_document_id( + document: &obkv::KvReader, + documents_batch_index: &DocumentsBatchIndex, + primary_key: PrimaryKey, + autogenerate_docids: bool, + count: usize, +) -> Result> { + match primary_key { + PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { + match document.get(primary_key_id) { + Some(document_id_bytes) => { + let document_id = serde_json::from_slice(document_id_bytes) + .map_err(InternalError::SerdeJson)?; + match validate_document_id_value(document_id)? { + Ok(document_id) => Ok(Ok(document_id)), + Err(user_error) => Ok(Err(user_error)), + } + } + None if autogenerate_docids => { + Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + } + None => Ok(Err(UserError::MissingDocumentId { + primary_key: primary_key.to_string(), + document: obkv_to_object(&document, &documents_batch_index)?, + })), + } + } + nested @ PrimaryKey::Nested { .. 
} => {
+            let mut matching_documents_ids = Vec::new();
+            for (first_level_name, right) in nested.possible_level_names() {
+                if let Some(field_id) = documents_batch_index.id(first_level_name) {
+                    if let Some(value_bytes) = document.get(field_id) {
+                        let object = serde_json::from_slice(value_bytes)
+                            .map_err(InternalError::SerdeJson)?;
+                        fetch_matching_values(object, right, &mut matching_documents_ids);
+
+                        if matching_documents_ids.len() >= 2 {
+                            return Ok(Err(UserError::TooManyDocumentIds {
+                                primary_key: nested.primary_key().to_string(),
+                                document: obkv_to_object(&document, &documents_batch_index)?,
+                            }));
+                        }
+                    }
+                }
+            }
+
+            match matching_documents_ids.pop() {
+                Some(document_id) => match validate_document_id_value(document_id)? {
+                    Ok(document_id) => Ok(Ok(document_id)),
+                    Err(user_error) => Ok(Err(user_error)),
+                },
+                None => Ok(Err(UserError::MissingDocumentId {
+                    primary_key: nested.primary_key().to_string(),
+                    document: obkv_to_object(&document, &documents_batch_index)?,
+                })),
+            }
+        }
+    }
+}
+
+/// A type that represents the type of primary key that has been set
+/// for this index, either a classic flat one or a nested one.
+#[derive(Debug, Clone, Copy)]
+enum PrimaryKey<'a> {
+    Flat { name: &'a str, field_id: FieldId },
+    Nested { name: &'a str },
+}
+
+impl PrimaryKey<'_> {
+    fn flat(name: &str, field_id: FieldId) -> PrimaryKey {
+        PrimaryKey::Flat { name, field_id }
+    }
+
+    fn nested(name: &str) -> PrimaryKey {
+        PrimaryKey::Nested { name }
+    }
+
+    fn primary_key(&self) -> &str {
+        match self {
+            PrimaryKey::Flat { name, .. } => name,
+            PrimaryKey::Nested { name } => name,
+        }
+    }
+
+    /// Returns an `Iterator` that gives all the possible field names the primary key
+    /// can have, depending on the first-level name and the depth of the objects.
+    fn possible_level_names(&self) -> impl Iterator + '_ {
+        let name = self.primary_key();
+        iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
+    }
+}
+
+fn contained_in(selector: &str, key: &str) -> bool {
+    selector.starts_with(key)
+        && selector[key.len()..]
+            .chars()
+            .next()
+            .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL)
+            .unwrap_or(true)
+}
+
+pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) {
+    match value {
+        Value::Object(object) => fetch_matching_values_in_object(object, selector, "", output),
+        otherwise => output.push(otherwise),
+    }
+}
+
+pub fn fetch_matching_values_in_object(
+    object: Object,
+    selector: &str,
+    base_key: &str,
+    output: &mut Vec,
+) {
+    for (key, value) in object {
+        let base_key = if base_key.is_empty() {
+            key.to_string()
+        } else {
+            format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, key)
+        };
+
+        // here if the user only specified `doggo` we need to iterate over all the fields of `doggo`
+        // so we check the contained_in on both sides.
+        let should_continue =
+            contained_in(selector, &base_key) || contained_in(&base_key, selector);
+
+        if should_continue {
+            match value {
+                Value::Object(object) => {
+                    fetch_matching_values_in_object(object, selector, &base_key, output)
+                }
+                value => output.push(value),
+            }
+        }
+    }
+}
+
 /// Returns a trimmed version of the document id or `None` if it is invalid.
 pub fn validate_document_id(document_id: &str) -> Option<&str> {
-    let id = document_id.trim();
-    if !id.is_empty()
-        && id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
+    let document_id = document_id.trim();
+    if !document_id.is_empty()
+        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
     {
-        Some(id)
+        Some(document_id)
     } else {
         None
     }
 }
 
-/// Parses a Json encoded document id and validate it, returning a user error when it is one.
-pub fn validate_document_id_from_json(bytes: &[u8]) -> Result> {
-    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+/// Parses a JSON encoded document id and validates it, returning a user error when there is one.
+pub fn validate_document_id_value(document_id: Value) -> Result> {
+    match document_id {
         Value::String(string) => match validate_document_id(&string) {
             Some(s) if s.len() == string.len() => Ok(Ok(string)),
             Some(s) => Ok(Ok(s.to_string())),
-            None => {
-                return Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) }))
-            }
+            None => Ok(Err(UserError::InvalidDocumentId { document_id: Value::String(string) })),
         },
         Value::Number(number) if number.is_i64() => Ok(Ok(number.to_string())),
-        content => return Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
+        content => Ok(Err(UserError::InvalidDocumentId { document_id: content.clone() })),
    }
 }
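To make the id rules above concrete, here is a minimal, self-contained sketch (not part of the patch) that mirrors the trimming and charset checks of `validate_document_id`; the sample ids are invented for the example:

```rust
fn validate_document_id(document_id: &str) -> Option<&str> {
    let document_id = document_id.trim();
    // Only ASCII letters, digits, hyphens and underscores are accepted.
    if !document_id.is_empty()
        && document_id.chars().all(|c| matches!(c, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_'))
    {
        Some(document_id)
    } else {
        None
    }
}

fn main() {
    assert_eq!(validate_document_id("  doc-42_A  "), Some("doc-42_A"));
    assert_eq!(validate_document_id("doc 42"), None); // spaces are rejected
    assert_eq!(validate_document_id(""), None); // empty ids are rejected
}
```

On top of these string rules, `validate_document_id_value` also accepts `i64` JSON numbers by stringifying them, and reports any other JSON value as an `InvalidDocumentId` user error.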
@@ -124,24 +268,22 @@ pub fn extract_float_from_value(value: Value) -> StdResult {
     }
 }
 
 pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> {
-    let result = match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
+    match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? {
         Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) {
             (Some(lat), Some(lng)) => {
                 match (extract_float_from_value(lat), extract_float_from_value(lng)) {
-                    (Ok(_), Ok(_)) => Ok(()),
-                    (Err(value), Ok(_)) => Err(GeoError::BadLatitude { document_id, value }),
-                    (Ok(_), Err(value)) => Err(GeoError::BadLongitude { document_id, value }),
+                    (Ok(_), Ok(_)) => Ok(Ok(())),
+                    (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })),
+                    (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })),
                     (Err(lat), Err(lng)) => {
-                        Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })
+                        Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng }))
                     }
                 }
             }
-            (None, Some(_)) => Err(GeoError::MissingLatitude { document_id }),
-            (Some(_), None) => Err(GeoError::MissingLongitude { document_id }),
-            (None, None) => Err(GeoError::MissingLatitudeAndLongitude { document_id }),
+            (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })),
+            (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })),
+            (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })),
         },
-        value => Err(GeoError::NotAnObject { document_id, value }),
-    };
-
-    Ok(result)
+        value => Ok(Err(GeoError::NotAnObject { document_id, value })),
    }
 }

From dc3f092d0782fd7b39bb29aa424caa04ada8de3c Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Thu, 16 Jun 2022 12:03:43 +0200
Subject: [PATCH 17/35] Do not leak an internal grenad Error

---
 milli/src/documents/mod.rs    |  2 +-
 milli/src/documents/reader.rs | 32 ++++++++++++++++++++++++++++++--
 milli/src/error.rs            |  7 +++++++
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs
index ee3593bf8..66a05b7b6 100644
--- a/milli/src/documents/mod.rs
+++ b/milli/src/documents/mod.rs
@@ -7,7 +7,7 @@ use
std::io; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; use obkv::KvReader; -pub use reader::{DocumentsBatchCursor, DocumentsBatchReader}; +pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; use crate::error::{FieldIdMapMissingEntry, InternalError}; diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 3dff999f5..720b403b9 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::io; +use std::{error, fmt, io}; use obkv::KvReader; @@ -79,7 +79,9 @@ impl DocumentsBatchCursor { impl DocumentsBatchCursor { /// Returns the next document, starting from the first one. Subsequent calls to /// `next_document` advance the document reader until all the documents have been read. - pub fn next_document(&mut self) -> Result>, grenad::Error> { + pub fn next_document( + &mut self, + ) -> Result>, DocumentsBatchCursorError> { match self.cursor.move_on_next()? { Some((key, value)) if key != DOCUMENTS_BATCH_INDEX_KEY => { Ok(Some(KvReader::new(value))) @@ -88,3 +90,29 @@ impl DocumentsBatchCursor { } } } + +/// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. +#[derive(Debug)] +pub struct DocumentsBatchCursorError { + inner: grenad::Error, +} + +impl From for DocumentsBatchCursorError { + fn from(error: grenad::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError { inner: error } + } +} + +impl Into for DocumentsBatchCursorError { + fn into(self) -> grenad::Error { + self.inner + } +} + +impl error::Error for DocumentsBatchCursorError {} + +impl fmt::Display for DocumentsBatchCursorError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + self.inner.fmt(f) + } +} diff --git a/milli/src/error.rs b/milli/src/error.rs index d05acbe1c..d9dca287d 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,6 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; +use crate::documents::DocumentsBatchCursorError; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -209,6 +210,12 @@ where } } +impl From for Error { + fn from(error: DocumentsBatchCursorError) -> Error { + Error::from(Into::::into(error)) + } +} + impl From for Error { fn from(_error: Infallible) -> Error { unreachable!() From ea852200bbb9d9520ab7e13e5f6019c3b7c7fed3 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Wed, 22 Jun 2022 10:28:49 +0200 Subject: [PATCH 18/35] Fix the format used for a geo deleting benchmark --- benchmarks/benches/indexing.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 1b501b21a..80c7ba0ed 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -1137,7 +1137,7 @@ fn deleting_geo_in_batches_default(c: &mut Criterion) { let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); - let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "json"); + let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); let (builder, user_error) = builder.add_documents(documents).unwrap(); user_error.unwrap(); builder.execute().unwrap(); From 6a0a0ae94f38ee04ffed9bd252444128163b1550 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 20 Jun 2022 13:48:02 +0200 Subject: [PATCH 19/35] Make the 
Transform read from an EnrichedDocumentsBatchReader --- milli/src/documents/enriched.rs | 103 ++++++++++++++++++ milli/src/documents/mod.rs | 13 +++ milli/src/documents/reader.rs | 20 ++-- milli/src/error.rs | 10 +- milli/src/update/index_documents/mod.rs | 6 +- milli/src/update/index_documents/transform.rs | 8 +- milli/src/update/index_documents/validate.rs | 22 ++-- 7 files changed, 158 insertions(+), 24 deletions(-) create mode 100644 milli/src/documents/enriched.rs diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs new file mode 100644 index 000000000..8645e06c4 --- /dev/null +++ b/milli/src/documents/enriched.rs @@ -0,0 +1,103 @@ +use std::fs::File; +use std::{io, str}; + +use obkv::KvReader; + +use super::{ + DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, + Error, +}; +use crate::FieldId; + +/// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have +/// been created with a `DocumentsBatchWriter` and, for the enriched data, +/// a simple `grenad::Reader`. +/// +/// The documents are returned in the form of `obkv::Reader` where each field is identified with a +/// `FieldId`. The mapping between the field ids and the field names is done thanks to the index. +pub struct EnrichedDocumentsBatchReader { + documents: DocumentsBatchReader, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchReader { + pub fn new( + documents: DocumentsBatchReader, + external_ids: grenad::Reader, + ) -> Result { + if documents.documents_count() as u64 == external_ids.len() { + Ok(EnrichedDocumentsBatchReader { + documents, + external_ids: external_ids.into_cursor()?, + }) + } else { + Err(Error::InvalidEnrichedData) + } + } + + pub fn documents_count(&self) -> u32 { + self.documents.documents_count() + } + + pub fn is_empty(&self) -> bool { + self.documents.is_empty() + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// This method returns a forward cursor over the enriched documents. + pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + let EnrichedDocumentsBatchReader { documents, mut external_ids } = self; + external_ids.reset(); + EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids } + } +} + +#[derive(Debug, Clone, Copy)] +pub struct EnrichedDocument<'a> { + pub document: KvReader<'a, FieldId>, + pub external_id: &'a str, +} + +pub struct EnrichedDocumentsBatchCursor { + documents: DocumentsBatchCursor, + external_ids: grenad::ReaderCursor, +} + +impl EnrichedDocumentsBatchCursor { + pub fn into_reader(self) -> EnrichedDocumentsBatchReader { + let EnrichedDocumentsBatchCursor { documents, external_ids } = self; + EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids } + } + + pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { + self.documents.documents_batch_index() + } + + /// Resets the cursor to be able to read from the start again. + pub fn reset(&mut self) { + self.documents.reset(); + self.external_ids.reset(); + } +} + +impl EnrichedDocumentsBatchCursor { + /// Returns the next document, starting from the first one. Subsequent calls to + /// `next_document` advance the document reader until all the documents have been read. + pub fn next_enriched_document( + &mut self, + ) -> Result, DocumentsBatchCursorError> { + let document = self.documents.next_document()?; + let external_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => Some(str::from_utf8(bytes)?), + None => None, + }; + + match document.zip(external_id) { + Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + None => Ok(None), + } + } +} diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 66a05b7b6..43bfc1c20 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,11 +1,14 @@ mod builder; +mod enriched; mod reader; use std::fmt::{self, Debug}; use std::io; +use std::str::Utf8Error; use bimap::BiHashMap; pub use builder::DocumentsBatchBuilder; +pub use enriched::{EnrichedDocument, EnrichedDocumentsBatchCursor, EnrichedDocumentsBatchReader}; use obkv::KvReader; pub use reader::{DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchReader}; use serde::{Deserialize, Serialize}; @@ -87,6 +90,8 @@ impl DocumentsBatchIndex { pub enum Error { ParseFloat { error: std::num::ParseFloatError, line: usize, value: String }, InvalidDocumentFormat, + InvalidEnrichedData, + InvalidUtf8(Utf8Error), Csv(csv::Error), Json(serde_json::Error), Serialize(serde_json::Error), @@ -118,6 +123,12 @@ impl From for Error { } } +impl From for Error { + fn from(other: Utf8Error) -> Self { + Self::InvalidUtf8(other) + } +} + impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { match self { @@ -127,6 +138,8 @@ impl fmt::Display for Error { Error::InvalidDocumentFormat => { f.write_str("Invalid document addition format, missing the documents batch index.") } + Error::InvalidEnrichedData => f.write_str("Invalid enriched data."), + Error::InvalidUtf8(e) => write!(f, "{}", e), Error::Io(e) => write!(f, "{}", e), Error::Serialize(e) => write!(f, "{}", e), Error::Grenad(e) => write!(f, "{}", e), diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 720b403b9..7bd6dbd51 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io}; +use std::{error, fmt, io, str}; use obkv::KvReader; @@ -93,19 +93,20 @@ impl DocumentsBatchCursor { /// The possible error thrown by the `DocumentsBatchCursor` when iterating on the documents. 
#[derive(Debug)] -pub struct DocumentsBatchCursorError { - inner: grenad::Error, +pub enum DocumentsBatchCursorError { + Grenad(grenad::Error), + Utf8(str::Utf8Error), } impl From for DocumentsBatchCursorError { fn from(error: grenad::Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError { inner: error } + DocumentsBatchCursorError::Grenad(error) } } -impl Into for DocumentsBatchCursorError { - fn into(self) -> grenad::Error { - self.inner +impl From for DocumentsBatchCursorError { + fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::Utf8(error) } } @@ -113,6 +114,9 @@ impl error::Error for DocumentsBatchCursorError {} impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - self.inner.fmt(f) + match self { + DocumentsBatchCursorError::Grenad(e) => e.fmt(f), + DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index d9dca287d..0419ceeda 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -7,7 +7,7 @@ use rayon::ThreadPoolBuildError; use serde_json::Value; use thiserror::Error; -use crate::documents::DocumentsBatchCursorError; +use crate::documents::{self, DocumentsBatchCursorError}; use crate::{CriterionError, DocumentId, FieldId, Object, SortError}; pub fn is_reserved_keyword(keyword: &str) -> bool { @@ -36,6 +36,8 @@ pub enum InternalError { FieldIdMappingMissingEntry { key: FieldId }, #[error(transparent)] Fst(#[from] fst::Error), + #[error(transparent)] + DocumentsError(#[from] documents::Error), #[error("Invalid compression type have been specified to grenad.")] GrenadInvalidCompressionType, #[error("Invalid grenad file with an invalid version format.")] @@ -185,6 +187,7 @@ macro_rules! error_from_sub_error { error_from_sub_error! { FieldIdMapMissingEntry => InternalError, fst::Error => InternalError, + documents::Error => InternalError, str::Utf8Error => InternalError, ThreadPoolBuildError => InternalError, SerializationError => InternalError, @@ -212,7 +215,10 @@ where impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { - Error::from(Into::::into(error)) + match error { + DocumentsBatchCursorError::Grenad(e) => Error::from(e), + DocumentsBatchCursorError::Utf8(e) => Error::from(e), + } } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index ba1064684..fe3bd1f8f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -27,7 +27,7 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_documents_batch; +use self::validate::validate_and_enrich_documents_batch; pub use self::validate::{ extract_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let reader = match validate_documents_batch( + let enriched_documents_reader = match validate_and_enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, @@ -155,7 +155,7 @@ where .transform .as_mut() .expect("Invalid document addition state") - .read_documents(reader, self.wtxn, &self.progress)? 
+            .read_documents(enriched_documents_reader, self.wtxn, &self.progress)?
             as u64;
         self.added_documents += indexed_documents;
diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 38f6dc8ff..4d0a4c311 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -14,7 +14,7 @@ use smartstring::SmartString;
 
 use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
 use super::{IndexDocumentsMethod, IndexerConfig};
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
+use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
 use crate::update::index_documents::validate_document_id_value;
@@ -153,7 +153,7 @@ impl<'a, 'i> Transform<'a, 'i> {
     pub fn read_documents(
         &mut self,
-        reader: DocumentsBatchReader,
+        reader: EnrichedDocumentsBatchReader,
         wtxn: &mut heed::RwTxn,
         progress_callback: F,
     ) -> Result
@@ -189,7 +189,9 @@ impl<'a, 'i> Transform<'a, 'i> {
         let mut external_id_buffer = Vec::new();
         let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new();
         let addition_index = cursor.documents_batch_index().clone();
-        while let Some(document) = cursor.next_document()? {
+        while let Some(enriched_document) = cursor.next_enriched_document()? {
+            let EnrichedDocument { document, external_id } = enriched_document;
+
             let mut field_buffer_cache = drop_and_reuse(field_buffer);
             if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) {
                 progress_callback(UpdateIndexingStep::RemapDocumentAddition {
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 32e8de03f..8b68532cb 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -4,27 +4,28 @@ use std::result::Result as StdResult;
 
 use serde_json::Value;
 
-use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader};
+use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader};
 use crate::error::{GeoError, InternalError, UserError};
-use crate::update::index_documents::obkv_to_object;
+use crate::update::index_documents::{obkv_to_object, writer_into_reader};
 use crate::{FieldId, Index, Object, Result};
 
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 
-/// This function validates a documents by checking that:
+/// This function validates and enriches the documents by checking that:
 /// - we can infer a primary key,
-/// - all the documents id exist and,
+/// - all the document ids exist and are extracted,
 /// - the validity of them but also,
 /// - the validity of the `_geo` field depending on the settings.
-pub fn validate_documents_batch(
+pub fn validate_and_enrich_documents_batch(
     rtxn: &heed::RoTxn,
     index: &Index,
     autogenerate_docids: bool,
     reader: DocumentsBatchReader,
-) -> Result, UserError>> {
+) -> Result, UserError>> {
     let mut cursor = reader.into_cursor();
     let mut documents_batch_index = cursor.documents_batch_index().clone();
+    let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?;
 
     // The primary key *field id* that has already been set for this index or the one
     // we will guess by searching for the first key that contains "id" as a substring.
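A note on the signature above: the enriching step returns a nested `Result` so that internal errors and user-facing validation errors travel on separate tracks, as the comment in `mod.rs` explains. A condensed sketch of that convention, with hypothetical stand-in error types rather than milli's real ones:

```rust
#[derive(Debug)]
struct UserError(&'static str);
#[derive(Debug)]
struct InternalError(&'static str);

// Same shape as the enriching function above: the outer `Result` is for
// internal errors that abort indexing, the inner one for user errors that
// only reject the current documents batch.
fn enrich(batch_is_valid: bool) -> Result<Result<u32, UserError>, InternalError> {
    if batch_is_valid {
        Ok(Ok(1)) // the number of enriched documents
    } else {
        Ok(Err(UserError("missing document id")))
    }
}

fn main() -> Result<(), InternalError> {
    match enrich(false)? {
        Ok(count) => println!("enriched {} documents", count),
        // Recoverable: report the error and keep accepting new batches.
        Err(user_error) => eprintln!("batch rejected: {:?}", user_error),
    }
    Ok(())
}
```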
@@ -82,6 +83,8 @@ pub fn validate_documents_batch(
             Err(user_error) => return Ok(Err(user_error)),
         };
 
+        external_ids.insert(count.to_be_bytes(), &document_id)?;
+
         if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) {
             if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? {
                 return Ok(Err(UserError::from(user_error)));
@@ -90,7 +93,10 @@ pub fn validate_documents_batch(
         count += 1;
     }
 
-    Ok(Ok(cursor.into_reader()))
+    let external_ids = writer_into_reader(external_ids)?;
+    let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?;
+
+    Ok(Ok(reader))
 }
 
 /// Retrieve the document id after validating it, returning a `UserError`
@@ -100,7 +106,7 @@ fn fetch_document_id(
     documents_batch_index: &DocumentsBatchIndex,
     primary_key: PrimaryKey,
     autogenerate_docids: bool,
-    count: usize,
+    count: u32,
 ) -> Result> {
     match primary_key {
         PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => {
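One detail worth noting in the hunk above: the external ids are inserted under `count.to_be_bytes()` keys. A grenad writer expects keys in ascending byte order, and big-endian encoding makes the byte order of a `u32` agree with its numeric order, so inserting the ids in document order is always valid. A small self-contained sketch of that property:

```rust
fn main() {
    // Big-endian keys: byte-wise ordering matches numeric ordering.
    let be: Vec<[u8; 4]> = (250u32..260).map(u32::to_be_bytes).collect();
    assert!(be.windows(2).all(|w| w[0] <= w[1]));

    // Little-endian keys would not stay sorted: crossing 255 -> 256
    // produces [255, 0, 0, 0] followed by [0, 1, 0, 0].
    let le: Vec<[u8; 4]> = (250u32..260).map(u32::to_le_bytes).collect();
    assert!(!le.windows(2).all(|w| w[0] <= w[1]));
}
```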
From 5f1bfb73eeeb1da17a1d2598d01916b08022aa8c Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 10:45:25 +0200
Subject: [PATCH 20/35] Extract the primary key name and make it accessible

---
 milli/src/documents/enriched.rs              | 28 +++++++++++++++++---
 milli/src/update/index_documents/validate.rs |  6 ++++-
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs
index 8645e06c4..918b47c95 100644
--- a/milli/src/documents/enriched.rs
+++ b/milli/src/documents/enriched.rs
@@ -17,17 +17,20 @@ use crate::FieldId;
 /// `FieldId`. The mapping between the field ids and the field names is done thanks to the index.
 pub struct EnrichedDocumentsBatchReader {
     documents: DocumentsBatchReader,
+    primary_key: String,
     external_ids: grenad::ReaderCursor,
 }
 
 impl EnrichedDocumentsBatchReader {
     pub fn new(
         documents: DocumentsBatchReader,
+        primary_key: String,
         external_ids: grenad::Reader,
     ) -> Result {
         if documents.documents_count() as u64 == external_ids.len() {
             Ok(EnrichedDocumentsBatchReader {
                 documents,
+                primary_key,
                 external_ids: external_ids.into_cursor()?,
             })
         } else {
             Err(Error::InvalidEnrichedData)
@@ -39,6 +42,10 @@ impl EnrichedDocumentsBatchReader {
         self.documents.documents_count()
     }
 
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
+    }
+
     pub fn is_empty(&self) -> bool {
         self.documents.is_empty()
     }
@@ -49,9 +56,13 @@ impl EnrichedDocumentsBatchReader {
     /// This method returns a forward cursor over the enriched documents.
     pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor {
-        let EnrichedDocumentsBatchReader { documents, mut external_ids } = self;
+        let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self;
         external_ids.reset();
-        EnrichedDocumentsBatchCursor { documents: documents.into_cursor(), external_ids }
+        EnrichedDocumentsBatchCursor {
+            documents: documents.into_cursor(),
+            primary_key,
+            external_ids,
+        }
     }
 }
 
@@ -63,13 +74,22 @@ pub struct EnrichedDocument<'a> {
 
 pub struct EnrichedDocumentsBatchCursor {
     documents: DocumentsBatchCursor,
+    primary_key: String,
     external_ids: grenad::ReaderCursor,
 }
 
 impl EnrichedDocumentsBatchCursor {
     pub fn into_reader(self) -> EnrichedDocumentsBatchReader {
-        let EnrichedDocumentsBatchCursor { documents, external_ids } = self;
-        EnrichedDocumentsBatchReader { documents: documents.into_reader(), external_ids }
+        let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self;
+        EnrichedDocumentsBatchReader {
+            documents: documents.into_reader(),
+            primary_key,
+            external_ids,
+        }
+    }
+
+    pub fn primary_key(&self) -> &str {
+        &self.primary_key
     }
 
     pub fn documents_batch_index(&self) -> &DocumentsBatchIndex {
diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 8b68532cb..83d7ef38e 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -94,7 +94,11 @@ pub fn validate_and_enrich_documents_batch(
     }
 
     let external_ids = writer_into_reader(external_ids)?;
-    let reader = EnrichedDocumentsBatchReader::new(cursor.into_reader(), external_ids)?;
+    let reader = EnrichedDocumentsBatchReader::new(
+        cursor.into_reader(),
+        primary_key.primary_key().to_string(),
+        external_ids,
+    )?;
 
     Ok(Ok(reader))
 }
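Since the reader now carries the (possibly nested) primary key name itself, it is worth recalling how such a name expands into candidate field lookups. The following standalone sketch mirrors the `possible_level_names` iterator introduced earlier in this series; the sample key is invented:

```rust
use std::iter;

const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';

// Candidate (first-level field name, remainder) pairs are produced by
// splitting the nested primary key at its rightmost separator first.
fn possible_level_names(name: &str) -> impl Iterator<Item = (&str, &str)> + '_ {
    iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL))
}

fn main() {
    let candidates: Vec<_> = possible_level_names("user.address.id").collect();
    assert_eq!(
        candidates,
        vec![("user.address.id", ""), ("user.address", "id"), ("user", "address")]
    );
}
```

Each candidate's first component is looked up as a top-level field of the document; the two-way `contained_in` check from patch 16 then walks the remaining levels inside the matching object.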
From 742543091e9b33084952ad27e00eff60655d7bf0 Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 10:48:07 +0200
Subject: [PATCH 21/35] Constify the default primary key name

---
 milli/src/update/index_documents/validate.rs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/validate.rs
index 83d7ef38e..4e52f4cb9 100644
--- a/milli/src/update/index_documents/validate.rs
+++ b/milli/src/update/index_documents/validate.rs
@@ -12,6 +12,9 @@ use crate::{FieldId, Index, Object, Result};
 /// The symbol used to define levels in a nested primary key.
 const PRIMARY_KEY_SPLIT_SYMBOL: char = '.';
 
+/// The default primary key that is used when none is specified.
+const DEFAULT_PRIMARY_KEY: &str = "id";
+
 /// This function validates and enriches the documents by checking that:
 /// - all the document ids exist and are extracted,
@@ -51,13 +54,14 @@ pub fn validate_and_enrich_documents_batch(
             None => {
                 let guessed = documents_batch_index
                     .iter()
-                    .filter(|(_, name)| name.to_lowercase().contains("id"))
+                    .filter(|(_, name)| name.to_lowercase().contains(DEFAULT_PRIMARY_KEY))
                     .min_by_key(|(fid, _)| *fid);
                 match guessed {
                     Some((id, name)) => PrimaryKey::flat(name.as_str(), *id),
-                    None if autogenerate_docids => {
-                        PrimaryKey::flat("id", documents_batch_index.insert("id"))
-                    }
+                    None if autogenerate_docids => PrimaryKey::flat(
+                        DEFAULT_PRIMARY_KEY,
+                        documents_batch_index.insert(DEFAULT_PRIMARY_KEY),
+                    ),
                     None => return Ok(Err(UserError::MissingPrimaryKey)),
                 }
             }

From 905af2a2e9f69b2341d532b105bbf1b3baff819e Mon Sep 17 00:00:00 2001
From: Kerollmops 
Date: Tue, 21 Jun 2022 11:12:51 +0200
Subject: [PATCH 22/35] Use the primary key and external id in the transform

---
 milli/src/update/index_documents/transform.rs | 290 +++++-------------
 1 file changed, 79 insertions(+), 211 deletions(-)

diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs
index 4d0a4c311..e82556ec7 100644
--- a/milli/src/update/index_documents/transform.rs
+++ b/milli/src/update/index_documents/transform.rs
@@ -9,7 +9,7 @@ use heed::RoTxn;
 use itertools::Itertools;
 use obkv::{KvReader, KvWriter};
 use roaring::RoaringBitmap;
-use serde_json::{Map, Value};
+use serde_json::Value;
 use smartstring::SmartString;
 
 use super::helpers::{create_sorter, create_writer, keep_latest_obkv, merge_obkvs, MergeFn};
@@ -17,15 +17,12 @@ use super::{IndexDocumentsMethod, IndexerConfig};
 use crate::documents::{DocumentsBatchIndex, EnrichedDocument, EnrichedDocumentsBatchReader};
 use crate::error::{Error, InternalError, UserError};
 use crate::index::db_name;
-use crate::update::index_documents::validate_document_id_value;
 use crate::update::{AvailableDocumentsIds, UpdateIndexingStep};
 use crate::{
     ExternalDocumentsIds, FieldDistribution, FieldId, FieldIdMapMissingEntry, FieldsIdsMap, Index,
     Result, BEU32,
 };
 
-const DEFAULT_PRIMARY_KEY_NAME: &str = "id";
-
 pub struct TransformOutput {
     pub primary_key: String,
     pub fields_ids_map: FieldsIdsMap,
@@ -85,18 +82,6 @@ fn create_fields_mapping(
         .collect()
 }
 
-/// Look for a key containing the [DEFAULT_PRIMARY_KEY_NAME] in the fields.
-/// It doesn't look in the subfield because we don't want to enable the
-/// primary key inference on nested objects.
-fn find_primary_key(index: &DocumentsBatchIndex) -> Option<&str> {
-    index
-        .iter()
-        .sorted_by_key(|(k, _)| *k)
-        .map(|(_, v)| v)
-        .find(|v| v.to_lowercase().contains(DEFAULT_PRIMARY_KEY_NAME))
-        .map(String::as_str)
-}
-
 impl<'a, 'i> Transform<'a, 'i> {
     pub fn new(
         wtxn: &mut heed::RwTxn,
@@ -167,28 +152,15 @@ impl<'a, 'i> Transform<'a, 'i> {
 
         let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?;
 
-        let alternative_name = self
-            .index
-            .primary_key(wtxn)?
- .or_else(|| find_primary_key(fields_index)) - .map(String::from); - - let (primary_key_id, primary_key_name) = compute_primary_key_pair( - self.index.primary_key(wtxn)?, - &mut self.fields_ids_map, - alternative_name, - self.autogenerate_docids, - )?; - - let primary_key_id_nested = primary_key_name.contains('.'); + let primary_key = cursor.primary_key().to_string(); + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; + let primary_key_id_nested = primary_key.contains('.'); let mut flattened_document = None; let mut obkv_buffer = Vec::new(); let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; - let mut external_id_buffer = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); - let addition_index = cursor.documents_batch_index().clone(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, external_id } = enriched_document; @@ -210,8 +182,7 @@ impl<'a, 'i> Transform<'a, 'i> { // it, transform it into a string and validate it, and then update it in the // document. If none is found, and we were told to generate missing document ids, then // we create the missing field, and update the new document. - let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; - let external_id = if primary_key_id_nested { + if primary_key_id_nested { let mut field_buffer_cache = field_buffer_cache.clone(); self.flatten_from_field_mapping( &mapping, @@ -220,29 +191,6 @@ impl<'a, 'i> Transform<'a, 'i> { &mut field_buffer_cache, )?; flattened_document = Some(&flattened_obkv_buffer); - let document = KvReader::new(&flattened_obkv_buffer); - - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? - } else { - update_primary_key( - document, - &addition_index, - primary_key_id, - &primary_key_name, - &mut uuid_buffer, - &mut field_buffer_cache, - &mut external_id_buffer, - self.autogenerate_docids, - )? }; // Insertion in a obkv need to be done with keys ordered. For now they are ordered @@ -318,7 +266,6 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); - external_id_buffer.clear(); obkv_buffer.clear(); } @@ -327,7 +274,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); self.index.put_fields_ids_map(wtxn, &self.fields_ids_map)?; - self.index.put_primary_key(wtxn, &primary_key_name)?; + self.index.put_primary_key(wtxn, &primary_key)?; self.documents_count += documents_count; // Now that we have a valid sorter that contains the user id and the obkv we // give it to the last transforming function which returns the TransformOutput. @@ -749,42 +696,6 @@ impl<'a, 'i> Transform<'a, 'i> { } } -/// Given an optional primary key and an optional alternative name, returns the (field_id, attr_name) -/// for the primary key according to the following rules: -/// - if primary_key is `Some`, returns the id and the name, else -/// - if alternative_name is Some, adds alternative to the fields_ids_map, and returns the pair, else -/// - if autogenerate_docids is true, insert the default id value in the field ids map ("id") and -/// returns the pair, else -/// - returns an error. 
-fn compute_primary_key_pair( - primary_key: Option<&str>, - fields_ids_map: &mut FieldsIdsMap, - alternative_name: Option, - autogenerate_docids: bool, -) -> Result<(FieldId, String)> { - match primary_key { - Some(primary_key) => { - let id = fields_ids_map.insert(primary_key).ok_or(UserError::AttributeLimitReached)?; - Ok((id, primary_key.to_string())) - } - None => { - let name = match alternative_name { - Some(key) => key, - None => { - if !autogenerate_docids { - // If there is no primary key in the current document batch, we must - // return an error and not automatically generate any document id. - return Err(UserError::MissingPrimaryKey.into()); - } - DEFAULT_PRIMARY_KEY_NAME.to_string() - } - }; - let id = fields_ids_map.insert(&name).ok_or(UserError::AttributeLimitReached)?; - Ok((id, name)) - } - } -} - /// Drops all the value of type `U` in vec, and reuses the allocation to create a `Vec`. /// /// The size and alignment of T and U must match. @@ -796,49 +707,6 @@ fn drop_and_reuse(mut vec: Vec) -> Vec { vec.into_iter().map(|_| unreachable!()).collect() } -fn update_primary_key<'a>( - document: KvReader<'a, FieldId>, - addition_index: &DocumentsBatchIndex, - primary_key_id: FieldId, - primary_key_name: &str, - uuid_buffer: &'a mut [u8; uuid::fmt::Hyphenated::LENGTH], - field_buffer_cache: &mut Vec<(u16, Cow<'a, [u8]>)>, - mut external_id_buffer: &'a mut Vec, - autogenerate_docids: bool, -) -> Result> { - match field_buffer_cache.iter_mut().find(|(id, _)| *id == primary_key_id) { - Some((_, bytes)) => { - let document_id = serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)?; - let value = validate_document_id_value(document_id)??; - serde_json::to_writer(external_id_buffer, &value).map_err(InternalError::SerdeJson)?; - Ok(Cow::Owned(value)) - } - None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); - serde_json::to_writer(&mut external_id_buffer, &uuid) - .map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, external_id_buffer.as_slice().into())); - Ok(Cow::Borrowed(&*uuid)) - } - None => { - let mut json = Map::new(); - for (key, value) in document.iter() { - let key = addition_index.name(key).map(ToString::to_string); - let value = serde_json::from_slice::(&value).ok(); - - if let Some((k, v)) = key.zip(value) { - json.insert(k, v); - } - } - - Err(UserError::MissingDocumentId { - primary_key: primary_key_name.to_string(), - document: json, - })? 
- } - } -} - impl TransformOutput { // find and insert the new field ids pub fn compute_real_facets(&self, rtxn: &RoTxn, index: &Index) -> Result> { @@ -853,87 +721,87 @@ impl TransformOutput { } } -#[cfg(test)] -mod test { - use super::*; +// #[cfg(test)] +// mod test { +// use super::*; - mod compute_primary_key { - use big_s::S; +// mod compute_primary_key { +// use big_s::S; - use super::{compute_primary_key_pair, FieldsIdsMap}; +// use super::{compute_primary_key_pair, FieldsIdsMap}; - #[test] - fn should_return_primary_key_if_is_some() { - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto").unwrap(); - let result = compute_primary_key_pair( - Some("toto"), - &mut fields_map, - Some("tata".to_string()), - false, - ); - assert_eq!(result.unwrap(), (0, "toto".to_string())); - assert_eq!(fields_map.len(), 1); +// #[test] +// fn should_return_primary_key_if_is_some() { +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto"), +// &mut fields_map, +// Some("tata".to_string()), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto".to_string())); +// assert_eq!(fields_map.len(), 1); - // and with nested fields - let mut fields_map = FieldsIdsMap::new(); - fields_map.insert("toto.tata").unwrap(); - let result = compute_primary_key_pair( - Some("toto.tata"), - &mut fields_map, - Some(S("titi")), - false, - ); - assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); - assert_eq!(fields_map.len(), 1); - } +// // and with nested fields +// let mut fields_map = FieldsIdsMap::new(); +// fields_map.insert("toto.tata").unwrap(); +// let result = compute_primary_key_pair( +// Some("toto.tata"), +// &mut fields_map, +// Some(S("titi")), +// false, +// ); +// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_alternative_if_primary_is_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = - compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); - assert_eq!(result.unwrap(), (0, S("tata"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_alternative_if_primary_is_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = +// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); +// assert_eq!(result.unwrap(), (0, S("tata"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_default_if_both_are_none() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, true); - assert_eq!(result.unwrap(), (0, S("id"))); - assert_eq!(fields_map.len(), 1); - } +// #[test] +// fn should_return_default_if_both_are_none() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, true); +// assert_eq!(result.unwrap(), (0, S("id"))); +// assert_eq!(fields_map.len(), 1); +// } - #[test] - fn should_return_err_if_both_are_none_and_recompute_is_false() { - let mut fields_map = FieldsIdsMap::new(); - let result = compute_primary_key_pair(None, &mut fields_map, None, false); - assert!(result.is_err()); - assert_eq!(fields_map.len(), 0); - } - } +// #[test] +// fn should_return_err_if_both_are_none_and_recompute_is_false() { +// let mut fields_map = FieldsIdsMap::new(); +// let result = compute_primary_key_pair(None, &mut fields_map, None, false); +// assert!(result.is_err()); +// 
assert_eq!(fields_map.len(), 0); +// } +// } - mod primary_key_inference { - use big_s::S; - use bimap::BiHashMap; +// mod primary_key_inference { +// use big_s::S; +// use bimap::BiHashMap; - use crate::documents::DocumentsBatchIndex; - use crate::update::index_documents::transform::find_primary_key; +// use crate::documents::DocumentsBatchIndex; +// use crate::update::index_documents::transform::find_primary_key; - #[test] - fn primary_key_infered_on_first_field() { - // We run the test multiple times to change the order in which the fields are iterated upon. - for _ in 1..50 { - let mut map = BiHashMap::new(); - map.insert(1, S("fakeId")); - map.insert(2, S("fakeId")); - map.insert(3, S("fakeId")); - map.insert(4, S("fakeId")); - map.insert(0, S("realId")); +// #[test] +// fn primary_key_infered_on_first_field() { +// // We run the test multiple times to change the order in which the fields are iterated upon. +// for _ in 1..50 { +// let mut map = BiHashMap::new(); +// map.insert(1, S("fakeId")); +// map.insert(2, S("fakeId")); +// map.insert(3, S("fakeId")); +// map.insert(4, S("fakeId")); +// map.insert(0, S("realId")); - assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); - } - } - } -} +// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); +// } +// } +// } +// } From c8ebf0de47e09964a9b4060b1da9982a593040e1 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:14:14 +0200 Subject: [PATCH 23/35] Rename the validate function as an enriching function --- .../index_documents/{validate.rs => enrich.rs} | 2 +- milli/src/update/index_documents/mod.rs | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) rename milli/src/update/index_documents/{validate.rs => enrich.rs} (99%) diff --git a/milli/src/update/index_documents/validate.rs b/milli/src/update/index_documents/enrich.rs similarity index 99% rename from milli/src/update/index_documents/validate.rs rename to milli/src/update/index_documents/enrich.rs index 4e52f4cb9..e3c3bd6f6 100644 --- a/milli/src/update/index_documents/validate.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -20,7 +20,7 @@ const DEFAULT_PRIMARY_KEY: &str = "id"; /// - all the documents id exist and are extracted, /// - the validity of them but also, /// - the validity of the `_geo` field depending on the settings. 
-pub fn validate_and_enrich_documents_batch( +pub fn enrich_documents_batch( rtxn: &heed::RoTxn, index: &Index, autogenerate_docids: bool, diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index fe3bd1f8f..db1a768e6 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1,8 +1,8 @@ +mod enrich; mod extract; mod helpers; mod transform; mod typed_chunk; -mod validate; use std::collections::HashSet; use std::io::{Cursor, Read, Seek}; @@ -19,6 +19,11 @@ use serde::{Deserialize, Serialize}; use slice_group_by::GroupBy; use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; +use self::enrich::enrich_documents_batch; +pub use self::enrich::{ + extract_float_from_value, validate_document_id, validate_document_id_value, + validate_geo_from_json, +}; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, fst_stream_into_vec, merge_cbo_roaring_bitmaps, merge_roaring_bitmaps, @@ -27,11 +32,6 @@ pub use self::helpers::{ }; use self::helpers::{grenad_obkv_into_chunks, GrenadParameters}; pub use self::transform::{Transform, TransformOutput}; -use self::validate::validate_and_enrich_documents_batch; -pub use self::validate::{ - extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, -}; use crate::documents::{obkv_to_object, DocumentsBatchReader}; use crate::error::UserError; pub use crate::update::index_documents::helpers::CursorClonableMmap; @@ -141,7 +141,7 @@ where // We check for user errors in this validator and if there is one, we can return // the `IndexDocument` struct as it is valid to send more documents into it. // However, if there is an internal error we throw it away! - let enriched_documents_reader = match validate_and_enrich_documents_batch( + let enriched_documents_reader = match enrich_documents_batch( self.wtxn, self.index, self.config.autogenerate_docids, From d1a4da98127207b7ce560280911480a4e0a2ced4 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 11:16:59 +0200 Subject: [PATCH 24/35] Generate a real UUIDv4 when ids are auto-generated --- milli/src/update/index_documents/enrich.rs | 93 ++++++++++++++----- .../extract/extract_geo_points.rs | 14 +-- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index e3c3bd6f6..5d00565a8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -1,6 +1,6 @@ use std::io::{Read, Seek}; -use std::iter; use std::result::Result as StdResult; +use std::{fmt, iter}; use serde_json::Value; @@ -29,6 +29,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; + let mut uuid_buffer = [0; uuid::adapter::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -76,31 +77,33 @@ pub fn enrich_documents_batch( let mut count = 0; while let Some(document) = cursor.next_document()? { - let document_id = match fetch_document_id( + let document_id = match fetch_or_generate_document_id( &document, &documents_batch_index, primary_key, autogenerate_docids, + &mut uuid_buffer, count, )? 
{ Ok(document_id) => document_id, Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), &document_id)?; + external_ids.insert(count.to_be_bytes(), document_id.value())?; if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { - if let Err(user_error) = validate_geo_from_json(Value::from(document_id), geo_value)? { + if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + count += 1; } let external_ids = writer_into_reader(external_ids)?; let reader = EnrichedDocumentsBatchReader::new( cursor.into_reader(), - primary_key.primary_key().to_string(), + primary_key.name().to_string(), external_ids, )?; @@ -109,13 +112,14 @@ pub fn enrich_documents_batch( /// Retrieve the document id after validating it, returning a `UserError` /// if the id is invalid or can't be guessed. -fn fetch_document_id( +fn fetch_or_generate_document_id( document: &obkv::KvReader, documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, + uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], count: u32, -) -> Result> { +) -> Result> { match primary_key { PrimaryKey::Flat { name: primary_key, field_id: primary_key_id } => { match document.get(primary_key_id) { @@ -123,12 +127,13 @@ fn fetch_document_id( let document_id = serde_json::from_slice(document_id_bytes) .map_err(InternalError::SerdeJson)?; match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), } } None if autogenerate_docids => { - Ok(Ok(format!("{{auto-generated id of the {}nth document}}", count))) + let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { primary_key: primary_key.to_string(), @@ -147,7 +152,7 @@ fn fetch_document_id( if matching_documents_ids.len() >= 2 { return Ok(Err(UserError::TooManyDocumentIds { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })); } @@ -157,11 +162,11 @@ fn fetch_document_id( match matching_documents_ids.pop() { Some(document_id) => match validate_document_id_value(document_id)? { - Ok(document_id) => Ok(Ok(document_id)), + Ok(document_id) => Ok(Ok(DocumentId::retrieved(document_id))), Err(user_error) => Ok(Err(user_error)), }, None => Ok(Err(UserError::MissingDocumentId { - primary_key: nested.primary_key().to_string(), + primary_key: nested.name().to_string(), document: obkv_to_object(&document, &documents_batch_index)?, })), } @@ -186,7 +191,7 @@ impl PrimaryKey<'_> { PrimaryKey::Nested { name } } - fn primary_key(&self) -> &str { + fn name(&self) -> &str { match self { PrimaryKey::Flat { name, .. } => name, PrimaryKey::Nested { name } => name, @@ -196,11 +201,53 @@ impl PrimaryKey<'_> { /// Returns an `Iterator` that gives all the possible fields names the primary key /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { - let name = self.primary_key(); + let name = self.name(); iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) } } +/// A type that represents a document id that has been retrieved from a document or auto-generated. 
+/// +/// In case the document id has been auto-generated, the document nth is kept to help +/// users debug if there is an issue with the document itself. +#[derive(Clone)] +pub enum DocumentId { + Retrieved { value: String }, + Generated { value: String, document_nth: u32 }, +} + +impl DocumentId { + fn retrieved(value: String) -> DocumentId { + DocumentId::Retrieved { value } + } + + fn generated(value: String, document_nth: u32) -> DocumentId { + DocumentId::Generated { value, document_nth } + } + + fn value(&self) -> &str { + match self { + DocumentId::Retrieved { value } => value, + DocumentId::Generated { value, .. } => value, + } + } + + fn debug(&self) -> String { + format!("{:?}", self) + } +} + +impl fmt::Debug for DocumentId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + DocumentId::Retrieved { value } => write!(f, "{:?}", value), + DocumentId::Generated { value, document_nth } => { + write!(f, "{{{:?}}} of the {}nth document", value, document_nth) + } + } + } +} + fn contained_in(selector: &str, key: &str) -> bool { selector.starts_with(key) && selector[key.len()..] @@ -281,23 +328,25 @@ pub fn extract_float_from_value(value: Value) -> StdResult { } } -pub fn validate_geo_from_json(document_id: Value, bytes: &[u8]) -> Result> { +pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result> { + use GeoError::*; + let debug_id = || Value::from(id.debug()); match serde_json::from_slice(bytes).map_err(InternalError::SerdeJson)? { Value::Object(mut object) => match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { match (extract_float_from_value(lat), extract_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), - (Err(value), Ok(_)) => Ok(Err(GeoError::BadLatitude { document_id, value })), - (Ok(_), Err(value)) => Ok(Err(GeoError::BadLongitude { document_id, value })), + (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), + (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), (Err(lat), Err(lng)) => { - Ok(Err(GeoError::BadLatitudeAndLongitude { document_id, lat, lng })) + Ok(Err(BadLatitudeAndLongitude { document_id: debug_id(), lat, lng })) } } } - (None, Some(_)) => Ok(Err(GeoError::MissingLatitude { document_id })), - (Some(_), None) => Ok(Err(GeoError::MissingLongitude { document_id })), - (None, None) => Ok(Err(GeoError::MissingLatitudeAndLongitude { document_id })), + (None, Some(_)) => Ok(Err(MissingLatitude { document_id: debug_id() })), + (Some(_), None) => Ok(Err(MissingLongitude { document_id: debug_id() })), + (None, None) => Ok(Err(MissingLatitudeAndLongitude { document_id: debug_id() })), }, - value => Ok(Err(GeoError::NotAnObject { document_id, value })), + value => Ok(Err(NotAnObject { document_id: debug_id(), value })), } } diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 46ef9ba9b..5a6de236b 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -29,9 +29,9 @@ pub fn extract_geo_points( let obkv = obkv::KvReader::new(value); // since we only needs the primary key when we throw an error we create this getter to // lazily get it when needed - let primary_key = || -> Value { - let primary_key = obkv.get(primary_key_id).unwrap(); - serde_json::from_slice(primary_key).unwrap() + let document_id = || -> Value { + let document_id = 
obkv.get(primary_key_id).unwrap(); + serde_json::from_slice(document_id).unwrap() }; // first we get the two fields @@ -43,19 +43,19 @@ pub fn extract_geo_points( let lat = extract_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) - .map_err(|lat| GeoError::BadLatitude { document_id: primary_key(), value: lat })?; + .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; let lng = extract_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) - .map_err(|lng| GeoError::BadLongitude { document_id: primary_key(), value: lng })?; + .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; let bytes: [u8; 16] = concat_arrays![lat.to_ne_bytes(), lng.to_ne_bytes()]; writer.insert(docid_bytes, bytes)?; } else if lat.is_none() && lng.is_some() { - return Err(GeoError::MissingLatitude { document_id: primary_key() })?; + return Err(GeoError::MissingLatitude { document_id: document_id() })?; } else if lat.is_some() && lng.is_none() { - return Err(GeoError::MissingLongitude { document_id: primary_key() })?; + return Err(GeoError::MissingLongitude { document_id: document_id() })?; } } From 0bbcc7b1808061355202efe040c38f64de64ba50 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 21 Jun 2022 14:41:19 +0200 Subject: [PATCH 25/35] Expose the `DocumentId` struct to be sure to inject the generated ids --- milli/src/documents/enriched.rs | 13 ++++++----- milli/src/documents/reader.rs | 12 +++++----- milli/src/error.rs | 2 +- milli/src/update/index_documents/enrich.rs | 22 ++++++++++++------- milli/src/update/index_documents/mod.rs | 2 +- milli/src/update/index_documents/transform.rs | 13 +++++++++-- milli/src/update/mod.rs | 2 +- 7 files changed, 41 insertions(+), 25 deletions(-) diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 918b47c95..4f45a891a 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -7,6 +7,7 @@ use super::{ DocumentsBatchCursor, DocumentsBatchCursorError, DocumentsBatchIndex, DocumentsBatchReader, Error, }; +use crate::update::DocumentId; use crate::FieldId; /// The `EnrichedDocumentsBatchReader` provides a way to iterate over documents that have @@ -66,10 +67,10 @@ impl EnrichedDocumentsBatchReader { } } -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone)] pub struct EnrichedDocument<'a> { pub document: KvReader<'a, FieldId>, - pub external_id: &'a str, + pub document_id: DocumentId, } pub struct EnrichedDocumentsBatchCursor { @@ -110,13 +111,13 @@ impl EnrichedDocumentsBatchCursor { &mut self, ) -> Result, DocumentsBatchCursorError> { let document = self.documents.next_document()?; - let external_id = match self.external_ids.move_on_next()? { - Some((_, bytes)) => Some(str::from_utf8(bytes)?), + let document_id = match self.external_ids.move_on_next()? 
{ + Some((_, bytes)) => serde_json::from_slice(bytes).map(Some)?, None => None, }; - match document.zip(external_id) { - Some((document, external_id)) => Ok(Some(EnrichedDocument { document, external_id })), + match document.zip(document_id) { + Some((document, document_id)) => Ok(Some(EnrichedDocument { document, document_id })), None => Ok(None), } } diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 7bd6dbd51..70b8b0131 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -1,5 +1,5 @@ use std::convert::TryInto; -use std::{error, fmt, io, str}; +use std::{error, fmt, io}; use obkv::KvReader; @@ -95,7 +95,7 @@ impl DocumentsBatchCursor { #[derive(Debug)] pub enum DocumentsBatchCursorError { Grenad(grenad::Error), - Utf8(str::Utf8Error), + SerdeJson(serde_json::Error), } impl From for DocumentsBatchCursorError { @@ -104,9 +104,9 @@ impl From for DocumentsBatchCursorError { } } -impl From for DocumentsBatchCursorError { - fn from(error: str::Utf8Error) -> DocumentsBatchCursorError { - DocumentsBatchCursorError::Utf8(error) +impl From for DocumentsBatchCursorError { + fn from(error: serde_json::Error) -> DocumentsBatchCursorError { + DocumentsBatchCursorError::SerdeJson(error) } } @@ -116,7 +116,7 @@ impl fmt::Display for DocumentsBatchCursorError { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DocumentsBatchCursorError::Grenad(e) => e.fmt(f), - DocumentsBatchCursorError::Utf8(e) => e.fmt(f), + DocumentsBatchCursorError::SerdeJson(e) => e.fmt(f), } } } diff --git a/milli/src/error.rs b/milli/src/error.rs index 0419ceeda..0abb41eec 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -217,7 +217,7 @@ impl From for Error { fn from(error: DocumentsBatchCursorError) -> Error { match error { DocumentsBatchCursorError::Grenad(e) => Error::from(e), - DocumentsBatchCursorError::Utf8(e) => Error::from(e), + DocumentsBatchCursorError::SerdeJson(e) => Error::from(InternalError::from(e)), } } } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 5d00565a8..1a0c31c24 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -2,6 +2,7 @@ use std::io::{Read, Seek}; use std::result::Result as StdResult; use std::{fmt, iter}; +use serde::{Deserialize, Serialize}; use serde_json::Value; use crate::documents::{DocumentsBatchIndex, DocumentsBatchReader, EnrichedDocumentsBatchReader}; @@ -89,14 +90,15 @@ pub fn enrich_documents_batch( Err(user_error) => return Ok(Err(user_error)), }; - external_ids.insert(count.to_be_bytes(), document_id.value())?; - if let Some(geo_value) = geo_field_id.and_then(|fid| document.get(fid)) { if let Err(user_error) = validate_geo_from_json(&document_id, geo_value)? { return Ok(Err(UserError::from(user_error))); } } + let document_id = serde_json::to_vec(&document_id).map_err(InternalError::SerdeJson)?; + external_ids.insert(count.to_be_bytes(), document_id)?; + count += 1; } @@ -210,7 +212,7 @@ impl PrimaryKey<'_> { /// /// In case the document id has been auto-generated, the document nth is kept to help /// users debug if there is an issue with the document itself. 
-#[derive(Clone)] +#[derive(Serialize, Deserialize, Clone)] pub enum DocumentId { Retrieved { value: String }, Generated { value: String, document_nth: u32 }, @@ -225,16 +227,20 @@ impl DocumentId { DocumentId::Generated { value, document_nth } } - fn value(&self) -> &str { + fn debug(&self) -> String { + format!("{:?}", self) + } + + pub fn is_generated(&self) -> bool { + matches!(self, DocumentId::Generated { .. }) + } + + pub fn value(&self) -> &str { match self { DocumentId::Retrieved { value } => value, DocumentId::Generated { value, .. } => value, } } - - fn debug(&self) -> String { - format!("{:?}", self) - } } impl fmt::Debug for DocumentId { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index db1a768e6..615e1dfc7 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -22,7 +22,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ extract_float_from_value, validate_document_id, validate_document_id_value, - validate_geo_from_json, + validate_geo_from_json, DocumentId, }; pub use self::helpers::{ as_cloneable_grenad, create_sorter, create_writer, fst_stream_into_hashset, diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index e82556ec7..a34295a50 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,8 +153,9 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let primary_key_id_nested = primary_key.contains('.'); + let primary_key_id = + self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; let mut flattened_document = None; let mut obkv_buffer = Vec::new(); @@ -162,7 +163,7 @@ impl<'a, 'i> Transform<'a, 'i> { let mut documents_count = 0; let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { - let EnrichedDocument { document, external_id } = enriched_document; + let EnrichedDocument { document, document_id } = enriched_document; let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { @@ -171,6 +172,14 @@ impl<'a, 'i> Transform<'a, 'i> { }); } + // When the document id has been auto-generated by the `enrich_documents_batch` + // we must insert this document id into the remaped document. 
+ let external_id = document_id.value(); + if document_id.is_generated() { + let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(docid))); + } + for (k, v) in document.iter() { let mapped_id = *mapping.get(&k).ok_or(InternalError::FieldIdMappingMissingEntry { key: k })?; diff --git a/milli/src/update/mod.rs b/milli/src/update/mod.rs index 965ed4fd2..1bf27a5f0 100644 --- a/milli/src/update/mod.rs +++ b/milli/src/update/mod.rs @@ -3,7 +3,7 @@ pub use self::clear_documents::ClearDocuments; pub use self::delete_documents::{DeleteDocuments, DocumentDeletionResult}; pub use self::facets::Facets; pub use self::index_documents::{ - DocumentAdditionResult, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, + DocumentAdditionResult, DocumentId, IndexDocuments, IndexDocumentsConfig, IndexDocumentsMethod, }; pub use self::indexer_config::IndexerConfig; pub use self::settings::{Setting, Settings}; From 5d149d631f7736443d5c871a3442bfd99dbaeb48 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Thu, 30 Jun 2022 15:13:50 +0200 Subject: [PATCH 26/35] Remove tests for a function that no more exists --- milli/src/update/index_documents/transform.rs | 85 ------------------- 1 file changed, 85 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index a34295a50..6bf3dde43 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -729,88 +729,3 @@ impl TransformOutput { .collect()) } } - -// #[cfg(test)] -// mod test { -// use super::*; - -// mod compute_primary_key { -// use big_s::S; - -// use super::{compute_primary_key_pair, FieldsIdsMap}; - -// #[test] -// fn should_return_primary_key_if_is_some() { -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto"), -// &mut fields_map, -// Some("tata".to_string()), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto".to_string())); -// assert_eq!(fields_map.len(), 1); - -// // and with nested fields -// let mut fields_map = FieldsIdsMap::new(); -// fields_map.insert("toto.tata").unwrap(); -// let result = compute_primary_key_pair( -// Some("toto.tata"), -// &mut fields_map, -// Some(S("titi")), -// false, -// ); -// assert_eq!(result.unwrap(), (0, "toto.tata".to_string())); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_alternative_if_primary_is_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = -// compute_primary_key_pair(None, &mut fields_map, Some("tata".to_string()), false); -// assert_eq!(result.unwrap(), (0, S("tata"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_default_if_both_are_none() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, true); -// assert_eq!(result.unwrap(), (0, S("id"))); -// assert_eq!(fields_map.len(), 1); -// } - -// #[test] -// fn should_return_err_if_both_are_none_and_recompute_is_false() { -// let mut fields_map = FieldsIdsMap::new(); -// let result = compute_primary_key_pair(None, &mut fields_map, None, false); -// assert!(result.is_err()); -// assert_eq!(fields_map.len(), 0); -// } -// } - -// mod primary_key_inference { -// use big_s::S; -// use bimap::BiHashMap; - -// use crate::documents::DocumentsBatchIndex; -// use 
crate::update::index_documents::transform::find_primary_key; - -// #[test] -// fn primary_key_infered_on_first_field() { -// // We run the test multiple times to change the order in which the fields are iterated upon. -// for _ in 1..50 { -// let mut map = BiHashMap::new(); -// map.insert(1, S("fakeId")); -// map.insert(2, S("fakeId")); -// map.insert(3, S("fakeId")); -// map.insert(4, S("fakeId")); -// map.insert(0, S("realId")); - -// assert_eq!(find_primary_key(&DocumentsBatchIndex(map)), Some("realId")); -// } -// } -// } -// } From 2eec290424b7508c3953be301e070028fdf900f7 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 16:36:23 +0200 Subject: [PATCH 27/35] Check the validity of the latitute and longitude numbers --- milli/src/error.rs | 6 ++--- milli/src/update/index_documents/enrich.rs | 24 ++++++++++++++----- .../extract/extract_geo_points.rs | 6 ++--- milli/src/update/index_documents/mod.rs | 2 +- 4 files changed, 25 insertions(+), 13 deletions(-) diff --git a/milli/src/error.rs b/milli/src/error.rs index 0abb41eec..80c923bd9 100644 --- a/milli/src/error.rs +++ b/milli/src/error.rs @@ -152,11 +152,11 @@ pub enum GeoError { MissingLatitude { document_id: Value }, #[error("Could not find longitude in the document with the id: `{document_id}`. Was expecting a `_geo.lng` field.")] MissingLongitude { document_id: Value }, - #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{lat}` and `{lng}`.")] + #[error("Could not parse latitude nor longitude in the document with the id: `{document_id}`. Was expecting finite numbers but instead got `{lat}` and `{lng}`.")] BadLatitudeAndLongitude { document_id: Value, lat: Value, lng: Value }, - #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse latitude in the document with the id: `{document_id}`. Was expecting a finite number but instead got `{value}`.")] BadLatitude { document_id: Value, value: Value }, - #[error("Could not parse longitude in the document with the id: `{document_id}`. Was expecting a number but instead got `{value}`.")] + #[error("Could not parse longitude in the document with the id: `{document_id}`. 
Was expecting a finite number but instead got `{value}`.")] BadLongitude { document_id: Value, value: Value }, } diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 1a0c31c24..28318881b 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -326,11 +326,23 @@ pub fn validate_document_id_value(document_id: Value) -> Result StdResult { - match value { - Value::Number(ref n) => n.as_f64().ok_or(value), - Value::String(ref s) => s.parse::().map_err(|_| value), - value => Err(value), +pub fn extract_finite_float_from_value(value: Value) -> StdResult { + let number = match value { + Value::Number(ref n) => match n.as_f64() { + Some(number) => number, + None => return Err(value), + }, + Value::String(ref s) => match s.parse::() { + Ok(number) => number, + Err(_) => return Err(value), + }, + value => return Err(value), + }; + + if number.is_finite() { + Ok(number) + } else { + Err(value) } } @@ -340,7 +352,7 @@ pub fn validate_geo_from_json(id: &DocumentId, bytes: &[u8]) -> Result match (object.remove("lat"), object.remove("lng")) { (Some(lat), Some(lng)) => { - match (extract_float_from_value(lat), extract_float_from_value(lng)) { + match (extract_finite_float_from_value(lat), extract_finite_float_from_value(lng)) { (Ok(_), Ok(_)) => Ok(Ok(())), (Err(value), Ok(_)) => Ok(Err(BadLatitude { document_id: debug_id(), value })), (Ok(_), Err(value)) => Ok(Err(BadLongitude { document_id: debug_id(), value })), diff --git a/milli/src/update/index_documents/extract/extract_geo_points.rs b/milli/src/update/index_documents/extract/extract_geo_points.rs index 5a6de236b..47085144a 100644 --- a/milli/src/update/index_documents/extract/extract_geo_points.rs +++ b/milli/src/update/index_documents/extract/extract_geo_points.rs @@ -6,7 +6,7 @@ use serde_json::Value; use super::helpers::{create_writer, writer_into_reader, GrenadParameters}; use crate::error::GeoError; -use crate::update::index_documents::extract_float_from_value; +use crate::update::index_documents::extract_finite_float_from_value; use crate::{FieldId, InternalError, Result}; /// Extracts the geographical coordinates contained in each document under the `_geo` field. 
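For reference, the finiteness rule that `extract_finite_float_from_value` introduces above can be exercised on its own: JSON numbers and numeric strings are accepted, while everything else (including `NaN` and the infinities, which `str::parse::<f64>` happily produces) hands the original `Value` back to the caller for error reporting. A minimal self-contained sketch of the same logic, under an illustrative name rather than the milli export:

```rust
use serde_json::Value;

// Mirrors the rules of the function above: accept JSON numbers and numeric
// strings, reject any other shape as well as non-finite floats by returning
// the original `Value` so the caller can embed it in a `GeoError`.
fn finite_float(value: Value) -> Result<f64, Value> {
    let number = match value {
        Value::Number(ref n) => match n.as_f64() {
            Some(number) => number,
            None => return Err(value),
        },
        Value::String(ref s) => match s.parse::<f64>() {
            Ok(number) => number,
            Err(_) => return Err(value),
        },
        value => return Err(value),
    };

    if number.is_finite() {
        Ok(number)
    } else {
        Err(value)
    }
}

fn main() {
    assert_eq!(finite_float(Value::from(12.5)), Ok(12.5));
    assert_eq!(finite_float(Value::from("13")), Ok(13.0));
    // "NaN" parses as an `f64`, so only the `is_finite` check rejects it.
    assert!(finite_float(Value::from("NaN")).is_err());
    assert!(finite_float(Value::from(vec![12, 13])).is_err());
}
```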
@@ -40,12 +40,12 @@ pub fn extract_geo_points( if let Some((lat, lng)) = lat.zip(lng) { // then we extract the values - let lat = extract_float_from_value( + let lat = extract_finite_float_from_value( serde_json::from_slice(lat).map_err(InternalError::SerdeJson)?, ) .map_err(|lat| GeoError::BadLatitude { document_id: document_id(), value: lat })?; - let lng = extract_float_from_value( + let lng = extract_finite_float_from_value( serde_json::from_slice(lng).map_err(InternalError::SerdeJson)?, ) .map_err(|lng| GeoError::BadLongitude { document_id: document_id(), value: lng })?; diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 615e1dfc7..652c1e72b 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -21,7 +21,7 @@ use typed_chunk::{write_typed_chunk_into_index, TypedChunk}; use self::enrich::enrich_documents_batch; pub use self::enrich::{ - extract_float_from_value, validate_document_id, validate_document_id_value, + extract_finite_float_from_value, validate_document_id, validate_document_id_value, validate_geo_from_json, DocumentId, }; pub use self::helpers::{ From dc61105554954801cce37cc4cca32b1ef4814347 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 17:44:08 +0200 Subject: [PATCH 28/35] Fix the nested document id fetching function --- milli/src/update/index_documents/enrich.rs | 4 +++- milli/src/update/index_documents/mod.rs | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 28318881b..0298f0532 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -204,7 +204,9 @@ impl PrimaryKey<'_> { /// can have depending of the first level name and deepnes of the objects. fn possible_level_names(&self) -> impl Iterator + '_ { let name = self.name(); - iter::successors(Some((name, "")), |(curr, _)| curr.rsplit_once(PRIMARY_KEY_SPLIT_SYMBOL)) + name.match_indices(PRIMARY_KEY_SPLIT_SYMBOL) + .map(move |(i, _)| (&name[..i], &name[i + PRIMARY_KEY_SPLIT_SYMBOL.len_utf8()..])) + .chain(iter::once((name, ""))) } } diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 652c1e72b..54599acce 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1198,7 +1198,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `"lol"`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `"lol"`."# ); let documents = documents!([ @@ -1212,7 +1212,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse latitude in the document with the id: `0`. Was expecting a number but instead got `[12,13]`."# + r#"Could not parse latitude in the document with the id: `0`. Was expecting a finite number but instead got `[12,13]`."# ); let documents = documents!([ @@ -1226,7 +1226,7 @@ mod tests { let error = builder.execute().unwrap_err(); assert_eq!( &error.to_string(), - r#"Could not parse longitude in the document with the id: `0`. Was expecting a number but instead got `"hello"`."# + r#"Could not parse longitude in the document with the id: `0`. 
Was expecting a finite number but instead got `"hello"`."# ); } From a892a4a79c5933a2eedb04d6b46bd19d66229729 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 11 Jul 2022 18:38:50 +0200 Subject: [PATCH 29/35] Introduce a function to extend from a JSON array of objects --- benchmarks/benches/utils.rs | 5 +- cli/src/main.rs | 5 +- http-ui/src/main.rs | 5 +- milli/src/documents/builder.rs | 9 ++++ milli/src/documents/mod.rs | 1 + milli/src/documents/serde_impl.rs | 76 +++++++++++++++++++++++++++++++ 6 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 milli/src/documents/serde_impl.rs diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 630e17943..51178b43b 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -164,11 +164,8 @@ fn documents_from_jsonl(reader: impl BufRead) -> anyhow::Result> { fn documents_from_json(reader: impl BufRead) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/cli/src/main.rs b/cli/src/main.rs index 0d197af17..35fef95c6 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -337,11 +337,8 @@ fn documents_from_jsonl(reader: impl Read) -> Result> { fn documents_from_json(reader: impl Read) -> Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/http-ui/src/main.rs b/http-ui/src/main.rs index 117aa31e8..83fce9a9c 100644 --- a/http-ui/src/main.rs +++ b/http-ui/src/main.rs @@ -1042,11 +1042,8 @@ fn documents_from_jsonl(reader: impl Read) -> anyhow::Result> { fn documents_from_json(reader: impl Read) -> anyhow::Result> { let mut documents = DocumentsBatchBuilder::new(Vec::new()); - let list: Vec = serde_json::from_reader(reader)?; - for object in list { - documents.append_json_object(&object)?; - } + documents.append_json_array(reader)?; documents.into_inner().map_err(Into::into) } diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index 589e52269..bb9d6aa68 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -1,9 +1,11 @@ use std::io::{self, Write}; use grenad::{CompressionType, WriterBuilder}; +use serde::de::Deserializer; use serde_json::{to_writer, Value}; use super::{DocumentsBatchIndex, Error, DOCUMENTS_BATCH_INDEX_KEY}; +use crate::documents::serde_impl::DocumentVisitor; use crate::Object; /// The `DocumentsBatchBuilder` provides a way to build a documents batch in the intermediary @@ -78,6 +80,13 @@ impl DocumentsBatchBuilder { Ok(()) } + /// Appends a new JSON array of objects into the batch and updates the `DocumentsBatchIndex` accordingly. + pub fn append_json_array(&mut self, reader: R) -> Result<(), Error> { + let mut de = serde_json::Deserializer::from_reader(reader); + let mut visitor = DocumentVisitor::new(self); + de.deserialize_any(&mut visitor)? + } + /// Appends a new CSV file into the batch and updates the `DocumentsBatchIndex` accordingly. pub fn append_csv(&mut self, mut reader: csv::Reader) -> Result<(), Error> { // Make sure that we insert the fields ids in order as the obkv writer has this requirement. 
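All three call sites above collapse into the same pattern: instead of deserializing the whole payload into a `Vec<Object>` and appending object by object, the reader is handed to `append_json_array`, which streams each object into the batch as it is parsed. A sketch of the shared helper, matching the `anyhow`-flavoured signatures used by the callers in this patch:

```rust
use std::io::Read;

use milli::documents::DocumentsBatchBuilder;

fn documents_from_json(reader: impl Read) -> anyhow::Result<Vec<u8>> {
    let mut documents = DocumentsBatchBuilder::new(Vec::new());

    // The JSON array is visited object by object, so no intermediate
    // `Vec<Object>` is ever materialized in memory.
    documents.append_json_array(reader)?;

    documents.into_inner().map_err(Into::into)
}
```

The streaming itself is driven by the `DocumentVisitor` added just below, whose `visit_map` buffers a single object at a time before appending it to the builder.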
diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index 43bfc1c20..c5ff7a120 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -1,6 +1,7 @@ mod builder; mod enriched; mod reader; +mod serde_impl; use std::fmt::{self, Debug}; use std::io; diff --git a/milli/src/documents/serde_impl.rs b/milli/src/documents/serde_impl.rs new file mode 100644 index 000000000..d4abdc844 --- /dev/null +++ b/milli/src/documents/serde_impl.rs @@ -0,0 +1,76 @@ +use std::fmt; +use std::io::Write; + +use serde::de::{DeserializeSeed, MapAccess, SeqAccess, Visitor}; + +use super::Error; +use crate::documents::DocumentsBatchBuilder; +use crate::Object; + +macro_rules! tri { + ($e:expr) => { + match $e { + Ok(r) => r, + Err(e) => return Ok(Err(e.into())), + } + }; +} + +pub struct DocumentVisitor<'a, W> { + inner: &'a mut DocumentsBatchBuilder, + object: Object, +} + +impl<'a, W> DocumentVisitor<'a, W> { + pub fn new(inner: &'a mut DocumentsBatchBuilder) -> Self { + DocumentVisitor { inner, object: Object::new() } + } +} + +impl<'a, 'de, W: Write> Visitor<'de> for &mut DocumentVisitor<'a, W> { + /// This Visitor value is nothing, since it write the value to a file. + type Value = Result<(), Error>; + + fn visit_seq(self, mut seq: A) -> Result + where + A: SeqAccess<'de>, + { + while let Some(v) = seq.next_element_seed(&mut *self)? { + tri!(v) + } + + Ok(Ok(())) + } + + fn visit_map(self, mut map: A) -> Result + where + A: MapAccess<'de>, + { + self.object.clear(); + while let Some((key, value)) = map.next_entry()? { + self.object.insert(key, value); + } + + tri!(self.inner.append_json_object(&self.object)); + + Ok(Ok(())) + } + + fn expecting(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "a documents, or a sequence of documents.") + } +} + +impl<'a, 'de, W> DeserializeSeed<'de> for &mut DocumentVisitor<'a, W> +where + W: Write, +{ + type Value = Result<(), Error>; + + fn deserialize(self, deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_map(self) + } +} From 192793ee38c7d819b5b542c8c7d0f1ae473d9619 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 12:42:06 +0200 Subject: [PATCH 30/35] Add some tests to check for the nested documents ids --- milli/src/update/index_documents/enrich.rs | 9 ++-- milli/src/update/index_documents/mod.rs | 52 ++++++++++++++++++++++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 0298f0532..d7ab89faa 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -257,12 +257,9 @@ impl fmt::Debug for DocumentId { } fn contained_in(selector: &str, key: &str) -> bool { - selector.starts_with(key) - && selector[key.len()..] 
- .chars() - .next() - .map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL) - .unwrap_or(true) + selector.strip_prefix(key).map_or(false, |tail| { + tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) + }) } pub fn fetch_matching_values(value: Value, selector: &str, output: &mut Vec) { diff --git a/milli/src/update/index_documents/mod.rs b/milli/src/update/index_documents/mod.rs index 54599acce..c9890f93f 100644 --- a/milli/src/update/index_documents/mod.rs +++ b/milli/src/update/index_documents/mod.rs @@ -1627,6 +1627,58 @@ mod tests { assert_eq!(documents_ids, vec![3]); } + #[test] + fn retrieve_a_b_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (_builder, user_error) = builder.add_documents(content).unwrap(); + + // There must be an issue with the primary key no present in the given document + user_error.unwrap_err(); + } + + #[test] + fn retrieve_a_b_c_nested_document_id() { + let path = tempfile::tempdir().unwrap(); + let mut options = EnvOpenOptions::new(); + options.map_size(10 * 1024 * 1024); // 10 MB + let index = Index::new(options, &path).unwrap(); + let config = IndexerConfig::default(); + + let mut wtxn = index.write_txn().unwrap(); + let mut builder = update::Settings::new(&mut wtxn, &index, &config); + builder.set_primary_key("a.b.c".to_owned()); + builder.execute(|_| ()).unwrap(); + + let content = documents!({ "a" : { "b" : { "c" : 1 }}}); + let indexing_config = IndexDocumentsConfig::default(); + let builder = + IndexDocuments::new(&mut wtxn, &index, &config, indexing_config.clone(), |_| ()) + .unwrap(); + let (builder, user_error) = builder.add_documents(content).unwrap(); + user_error.unwrap(); + builder.execute().unwrap(); + wtxn.commit().unwrap(); + + let rtxn = index.read_txn().unwrap(); + let external_documents_ids = index.external_documents_ids(&rtxn).unwrap(); + assert!(external_documents_ids.get("1").is_some()); + } + #[test] fn test_facets_generation() { let path = tempfile::tempdir().unwrap(); From 25e768f31c32d8718132be124ab2e32c94c15f8f Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 14:41:33 +0200 Subject: [PATCH 31/35] Fix another issue with the nested primary key selector --- milli/src/update/index_documents/enrich.rs | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index d7ab89faa..56f8fa4c0 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -256,7 +256,7 @@ impl fmt::Debug for DocumentId { } } -fn contained_in(selector: &str, key: &str) -> bool { +fn starts_with(selector: &str, key: &str) -> bool { selector.strip_prefix(key).map_or(false, |tail| { tail.chars().next().map(|c| c == PRIMARY_KEY_SPLIT_SYMBOL).unwrap_or(true) }) @@ -282,12 +282,7 @@ pub fn fetch_matching_values_in_object( format!("{}{}{}", base_key, PRIMARY_KEY_SPLIT_SYMBOL, 
key) }; - // here if the user only specified `doggo` we need to iterate in all the fields of `doggo` - // so we check the contained_in on both side. - let should_continue = - contained_in(selector, &base_key) || contained_in(&base_key, selector); - - if should_continue { + if starts_with(selector, &base_key) { match value { Value::Object(object) => { fetch_matching_values_in_object(object, selector, &base_key, output) From 448114cc1c1edd1781a830590f2c888dcdab775d Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 12 Jul 2022 15:22:09 +0200 Subject: [PATCH 32/35] Fix the benchmarks with the new indexation API --- benchmarks/benches/indexing.rs | 40 +++++++++++++--------- benchmarks/benches/utils.rs | 4 +-- milli/src/update/index_documents/enrich.rs | 6 ++-- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/benchmarks/benches/indexing.rs b/benchmarks/benches/indexing.rs index 80c7ba0ed..81b21b5ea 100644 --- a/benchmarks/benches/indexing.rs +++ b/benchmarks/benches/indexing.rs @@ -170,12 +170,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -185,12 +186,13 @@ fn reindexing_songs_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_SONGS, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -460,12 +462,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -476,12 +479,13 @@ fn reindexing_wiki(c: &mut Criterion) { let indexing_config = IndexDocumentsConfig { autogenerate_docids: true, ..Default::default() }; let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_WIKI_ARTICLES, "csv"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -680,12 +684,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); 
let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -695,12 +700,13 @@ fn reindexing_movies_default(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::MOVIES, "json"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1079,12 +1085,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); @@ -1095,12 +1102,13 @@ fn reindexing_geo(c: &mut Criterion) { let config = IndexerConfig::default(); let indexing_config = IndexDocumentsConfig::default(); let mut wtxn = index.write_txn().unwrap(); - let mut builder = + let builder = IndexDocuments::new(&mut wtxn, &index, &config, indexing_config, |_| ()) .unwrap(); let documents = utils::documents_from(datasets_paths::SMOL_ALL_COUNTRIES, "jsonl"); - builder.add_documents(documents).unwrap(); + let (builder, user_error) = builder.add_documents(documents).unwrap(); + user_error.unwrap(); builder.execute().unwrap(); wtxn.commit().unwrap(); diff --git a/benchmarks/benches/utils.rs b/benchmarks/benches/utils.rs index 51178b43b..fba05edbe 100644 --- a/benchmarks/benches/utils.rs +++ b/benchmarks/benches/utils.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] use std::fs::{create_dir_all, remove_dir_all, File}; -use std::io::{self, BufReader, Cursor, Read, Seek}; +use std::io::{self, BufRead, BufReader, Cursor, Read, Seek}; use std::num::ParseFloatError; use std::path::Path; @@ -138,7 +138,7 @@ pub fn run_benches(c: &mut criterion::Criterion, confs: &[Conf]) { } } -pub fn documents_from(filename: &str, filetype: &str) -> DocumentBatchReader { +pub fn documents_from(filename: &str, filetype: &str) -> DocumentsBatchReader { let reader = File::open(filename).expect(&format!("could not find the dataset in: {}", filename)); let reader = BufReader::new(reader); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 56f8fa4c0..51495c598 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -30,7 +30,7 @@ pub fn enrich_documents_batch( let mut cursor = reader.into_cursor(); let mut documents_batch_index = cursor.documents_batch_index().clone(); let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; - let mut uuid_buffer = [0; 
uuid::adapter::Hyphenated::LENGTH]; + let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; // The primary key *field id* that has already been set for this index or the one // we will guess by searching for the first key that contains "id" as a substring. @@ -119,7 +119,7 @@ fn fetch_or_generate_document_id( documents_batch_index: &DocumentsBatchIndex, primary_key: PrimaryKey, autogenerate_docids: bool, - uuid_buffer: &mut [u8; uuid::adapter::Hyphenated::LENGTH], + uuid_buffer: &mut [u8; uuid::fmt::Hyphenated::LENGTH], count: u32, ) -> Result> { match primary_key { @@ -134,7 +134,7 @@ fn fetch_or_generate_document_id( } } None if autogenerate_docids => { - let uuid = uuid::Uuid::new_v4().to_hyphenated().encode_lower(uuid_buffer); + let uuid = uuid::Uuid::new_v4().as_hyphenated().encode_lower(uuid_buffer); Ok(Ok(DocumentId::generated(uuid.to_string(), count))) } None => Ok(Err(UserError::MissingDocumentId { From ab1571cdec615d025eeab8c924df74a5c485dceb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 12:41:58 +0200 Subject: [PATCH 33/35] Simplify Transform::read_documents, enabled by enriched documents reader --- milli/src/update/index_documents/transform.rs | 97 +++---------------- 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index 6bf3dde43..d03a803fd 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -153,18 +153,18 @@ impl<'a, 'i> Transform<'a, 'i> { let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; let primary_key = cursor.primary_key().to_string(); - let primary_key_id_nested = primary_key.contains('.'); let primary_key_id = self.fields_ids_map.insert(&primary_key).ok_or(UserError::AttributeLimitReached)?; - let mut flattened_document = None; let mut obkv_buffer = Vec::new(); - let mut flattened_obkv_buffer = Vec::new(); let mut documents_count = 0; + let mut docid_buffer: Vec = Vec::new(); let mut field_buffer: Vec<(u16, Cow<[u8]>)> = Vec::new(); while let Some(enriched_document) = cursor.next_enriched_document()? { let EnrichedDocument { document, document_id } = enriched_document; + // drop_and_reuse is called instead of .clear() to communicate to the compiler that field_buffer + // does not keep references from the cursor between loop iterations let mut field_buffer_cache = drop_and_reuse(field_buffer); if self.indexer_settings.log_every_n.map_or(false, |len| documents_count % len == 0) { progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -176,8 +176,9 @@ impl<'a, 'i> Transform<'a, 'i> { // we must insert this document id into the remaped document. let external_id = document_id.value(); if document_id.is_generated() { - let docid = serde_json::to_vec(external_id).map_err(InternalError::SerdeJson)?; - field_buffer_cache.push((primary_key_id, Cow::from(docid))); + serde_json::to_writer(&mut docid_buffer, external_id) + .map_err(InternalError::SerdeJson)?; + field_buffer_cache.push((primary_key_id, Cow::from(&docid_buffer))); } for (k, v) in document.iter() { @@ -186,22 +187,6 @@ impl<'a, 'i> Transform<'a, 'i> { field_buffer_cache.push((mapped_id, Cow::from(v))); } - // We need to make sure that every document has a primary key. After we have remapped - // all the fields in the document, we try to find the primary key value. 
If we can find - // it, transform it into a string and validate it, and then update it in the - // document. If none is found, and we were told to generate missing document ids, then - // we create the missing field, and update the new document. - if primary_key_id_nested { - let mut field_buffer_cache = field_buffer_cache.clone(); - self.flatten_from_field_mapping( - &mapping, - &document, - &mut flattened_obkv_buffer, - &mut field_buffer_cache, - )?; - flattened_document = Some(&flattened_obkv_buffer); - }; - // Insertion in a obkv need to be done with keys ordered. For now they are ordered // according to the document addition key order, so we sort it according to the // fieldids map keys order. @@ -256,18 +241,12 @@ impl<'a, 'i> Transform<'a, 'i> { } // We use the extracted/generated user id as the key for this document. - self.original_sorter.insert(&docid.to_be_bytes(), obkv_buffer.clone())?; + self.original_sorter.insert(&docid.to_be_bytes(), &obkv_buffer)?; documents_count += 1; - if let Some(flatten) = flattened_document { - self.flattened_sorter.insert(docid.to_be_bytes(), &flatten)?; - } else { - match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { - Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, - None => { - self.flattened_sorter.insert(docid.to_be_bytes(), obkv_buffer.clone())? - } - } + match self.flatten_from_fields_ids_map(KvReader::new(&obkv_buffer))? { + Some(buffer) => self.flattened_sorter.insert(docid.to_be_bytes(), &buffer)?, + None => self.flattened_sorter.insert(docid.to_be_bytes(), &obkv_buffer)?, } progress_callback(UpdateIndexingStep::RemapDocumentAddition { @@ -275,6 +254,7 @@ impl<'a, 'i> Transform<'a, 'i> { }); field_buffer = drop_and_reuse(field_buffer_cache); + docid_buffer.clear(); obkv_buffer.clear(); } @@ -345,61 +325,6 @@ impl<'a, 'i> Transform<'a, 'i> { Ok(Some(buffer)) } - // Flatten a document from a field mapping generated by [create_fields_mapping] - fn flatten_from_field_mapping( - &mut self, - mapping: &HashMap, - obkv: &KvReader, - output_buffer: &mut Vec, - field_buffer_cache: &mut Vec<(u16, Cow<[u8]>)>, - ) -> Result<()> { - // store the keys and values of the json + the original obkv - let mut key_value: Vec<(FieldId, Cow<[u8]>)> = Vec::new(); - - // if the primary_key is nested we need to flatten the document before being able to do anything - let mut doc = serde_json::Map::new(); - - // we recreate a json containing only the fields that needs to be flattened. - // all the raw values get inserted directly in the `key_value` vec. - for (key, value) in obkv.iter() { - if json_depth_checker::should_flatten_from_unchecked_slice(value) { - let key = - mapping.get(&key).ok_or(InternalError::FieldIdMappingMissingEntry { key })?; - let key = - self.fields_ids_map.name(*key).ok_or(FieldIdMapMissingEntry::FieldId { - field_id: *key, - process: "Flatten from field mapping.", - })?; - let value = serde_json::from_slice::(value) - .map_err(InternalError::SerdeJson)?; - doc.insert(key.to_string(), value); - } else { - key_value.push((key, value.into())); - } - } - - let flattened = flatten_serde_json::flatten(&doc); - - // Once we have the flattened version we insert all the new generated fields_ids - // (if any) in the fields ids map and serialize the value. 
- for (key, value) in flattened.into_iter() { - let fid = self.fields_ids_map.insert(&key).ok_or(UserError::AttributeLimitReached)?; - let value = serde_json::to_vec(&value).map_err(InternalError::SerdeJson)?; - key_value.push((fid, value.clone().into())); - - if field_buffer_cache.iter().find(|(id, _)| *id == fid).is_none() { - field_buffer_cache.push((fid, value.into())); - } - } - - // we sort the key. If there was a conflict between the obkv and the new generated value the - // keys will be consecutive. - key_value.sort_unstable_by_key(|(key, _)| *key); - - Self::create_obkv_from_key_value(&mut key_value, output_buffer)?; - Ok(()) - } - /// Generate an obkv from a slice of key / value sorted by key. fn create_obkv_from_key_value( key_value: &mut [(FieldId, Cow<[u8]>)], From fc9f3f31e74edda92ad1beb31226e9cb100781e1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Mon, 18 Jul 2022 16:08:01 +0200 Subject: [PATCH 34/35] Change DocumentsBatchReader to access cursor and index at same time Otherwise it is not possible to iterate over all documents while using the fields index at the same time. --- milli/src/documents/builder.rs | 71 ++++++++++--------- milli/src/documents/enriched.rs | 25 ++----- milli/src/documents/mod.rs | 16 +++-- milli/src/documents/reader.rs | 20 ++---- milli/src/update/index_documents/enrich.rs | 9 +-- milli/src/update/index_documents/transform.rs | 6 +- 6 files changed, 65 insertions(+), 82 deletions(-) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index bb9d6aa68..dc027e1b7 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -216,9 +216,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 3); let document = cursor.next_document().unwrap().unwrap(); @@ -240,9 +240,9 @@ mod test { assert_eq!(builder.documents_count(), 2); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert_eq!(index.len(), 2); let document = cursor.next_document().unwrap().unwrap(); @@ -264,9 +264,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -292,9 +292,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = 
cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -319,9 +319,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -346,9 +346,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -373,9 +373,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -400,9 +400,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -427,9 +427,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -454,9 +454,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); @@ -482,8 +482,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut cursor, _) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); assert!(cursor.next_document().is_err()); } 
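The updated tests all follow the access pattern this patch is about: `into_cursor_and_fields_index` hands back the cursor and the `DocumentsBatchIndex` as a pair, so the documents can be iterated while the index is consulted, without cloning the index out of the cursor first. A condensed sketch of that pattern, assuming the `obkv_to_object` helper used by these tests is importable from `milli::documents`:

```rust
use std::io::Cursor;

use milli::documents::{obkv_to_object, DocumentsBatchReader};
use serde_json::Value;

// Hypothetical driver: decode every document of a batch back to JSON.
fn print_documents(vector: Vec<u8>) {
    let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
        .unwrap()
        .into_cursor_and_fields_index();

    while let Some(document) = cursor.next_document().unwrap() {
        // The fields index lives outside the cursor now, so it can be used
        // freely while the cursor is mutably borrowed by the iteration.
        let object = obkv_to_object(&document, &index).unwrap();
        println!("{}", Value::from(object));
    }
}
```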
@@ -498,9 +499,9 @@ mod test { builder.append_csv(csv).unwrap(); let vector = builder.into_inner().unwrap(); - let mut cursor = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); - let index = cursor.documents_batch_index().clone(); + let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let val = obkv_to_object(&doc, &index).map(Value::from).unwrap(); diff --git a/milli/src/documents/enriched.rs b/milli/src/documents/enriched.rs index 4f45a891a..fa21c0f87 100644 --- a/milli/src/documents/enriched.rs +++ b/milli/src/documents/enriched.rs @@ -56,14 +56,13 @@ impl EnrichedDocumentsBatchReader { } /// This method returns a forward cursor over the enriched documents. - pub fn into_cursor(self) -> EnrichedDocumentsBatchCursor { + pub fn into_cursor_and_fields_index( + self, + ) -> (EnrichedDocumentsBatchCursor, DocumentsBatchIndex) { let EnrichedDocumentsBatchReader { documents, primary_key, mut external_ids } = self; + let (documents, fields_index) = documents.into_cursor_and_fields_index(); external_ids.reset(); - EnrichedDocumentsBatchCursor { - documents: documents.into_cursor(), - primary_key, - external_ids, - } + (EnrichedDocumentsBatchCursor { documents, primary_key, external_ids }, fields_index) } } @@ -80,23 +79,9 @@ pub struct EnrichedDocumentsBatchCursor { } impl EnrichedDocumentsBatchCursor { - pub fn into_reader(self) -> EnrichedDocumentsBatchReader { - let EnrichedDocumentsBatchCursor { documents, primary_key, external_ids } = self; - EnrichedDocumentsBatchReader { - documents: documents.into_reader(), - primary_key, - external_ids, - } - } - pub fn primary_key(&self) -> &str { &self.primary_key } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - self.documents.documents_batch_index() - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.documents.reset(); diff --git a/milli/src/documents/mod.rs b/milli/src/documents/mod.rs index c5ff7a120..e766e29cf 100644 --- a/milli/src/documents/mod.rs +++ b/milli/src/documents/mod.rs @@ -203,10 +203,11 @@ mod test { builder.append_json_object(value.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(Cursor::new(vector)).unwrap().into_cursor(); + let (mut documents, index) = DocumentsBatchReader::from_reader(Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); - assert_eq!(documents.documents_batch_index().iter().count(), 5); + assert_eq!(index.iter().count(), 5); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 5); assert!(documents.next_document().unwrap().is_none()); @@ -226,9 +227,10 @@ mod test { builder.append_json_object(doc2.as_object().unwrap()).unwrap(); let vector = builder.into_inner().unwrap(); - let mut documents = - DocumentsBatchReader::from_reader(io::Cursor::new(vector)).unwrap().into_cursor(); - assert_eq!(documents.documents_batch_index().iter().count(), 2); + let (mut documents, index) = DocumentsBatchReader::from_reader(io::Cursor::new(vector)) + .unwrap() + .into_cursor_and_fields_index(); + assert_eq!(index.iter().count(), 2); let reader = documents.next_document().unwrap().unwrap(); assert_eq!(reader.iter().count(), 1); assert!(documents.next_document().unwrap().is_some()); @@ -243,7 +245,7 @@ mod test { } }]); - let mut cursor = docs_reader.into_cursor(); + let (mut cursor, _) = docs_reader.into_cursor_and_fields_index(); let doc = cursor.next_document().unwrap().unwrap(); let nested: Value = serde_json::from_slice(doc.get(0).unwrap()).unwrap(); assert_eq!(nested, json!({ "toto": ["hello"] })); diff --git a/milli/src/documents/reader.rs b/milli/src/documents/reader.rs index 70b8b0131..a8a4c662d 100644 --- a/milli/src/documents/reader.rs +++ b/milli/src/documents/reader.rs @@ -17,6 +17,10 @@ pub struct DocumentsBatchReader { } impl DocumentsBatchReader { + pub fn new(cursor: DocumentsBatchCursor, fields_index: DocumentsBatchIndex) -> Self { + Self { cursor: cursor.cursor, fields_index } + } + /// Construct a `DocumentsReader` from a reader. /// /// It first retrieves the index, then moves to the first document. Use the `into_cursor` @@ -46,30 +50,20 @@ impl DocumentsBatchReader { } /// This method returns a forward cursor over the documents. - pub fn into_cursor(self) -> DocumentsBatchCursor { + pub fn into_cursor_and_fields_index(self) -> (DocumentsBatchCursor, DocumentsBatchIndex) { let DocumentsBatchReader { cursor, fields_index } = self; - let mut cursor = DocumentsBatchCursor { cursor, fields_index }; + let mut cursor = DocumentsBatchCursor { cursor }; cursor.reset(); - cursor + (cursor, fields_index) } } /// A forward cursor over the documents in a `DocumentsBatchReader`. pub struct DocumentsBatchCursor { cursor: grenad::ReaderCursor, - fields_index: DocumentsBatchIndex, } impl DocumentsBatchCursor { - pub fn into_reader(self) -> DocumentsBatchReader { - let DocumentsBatchCursor { cursor, fields_index, .. } = self; - DocumentsBatchReader { cursor, fields_index } - } - - pub fn documents_batch_index(&self) -> &DocumentsBatchIndex { - &self.fields_index - } - /// Resets the cursor to be able to read from the start again. 
pub fn reset(&mut self) { self.cursor.reset(); diff --git a/milli/src/update/index_documents/enrich.rs b/milli/src/update/index_documents/enrich.rs index 51495c598..7c9a016d8 100644 --- a/milli/src/update/index_documents/enrich.rs +++ b/milli/src/update/index_documents/enrich.rs @@ -27,8 +27,8 @@ pub fn enrich_documents_batch( autogenerate_docids: bool, reader: DocumentsBatchReader, ) -> Result, UserError>> { - let mut cursor = reader.into_cursor(); - let mut documents_batch_index = cursor.documents_batch_index().clone(); + let (mut cursor, mut documents_batch_index) = reader.into_cursor_and_fields_index(); + let mut external_ids = tempfile::tempfile().map(grenad::Writer::new)?; let mut uuid_buffer = [0; uuid::fmt::Hyphenated::LENGTH]; @@ -103,9 +103,10 @@ pub fn enrich_documents_batch( } let external_ids = writer_into_reader(external_ids)?; + let primary_key_name = primary_key.name().to_string(); let reader = EnrichedDocumentsBatchReader::new( - cursor.into_reader(), - primary_key.name().to_string(), + DocumentsBatchReader::new(cursor, documents_batch_index), + primary_key_name, external_ids, )?; diff --git a/milli/src/update/index_documents/transform.rs b/milli/src/update/index_documents/transform.rs index d03a803fd..0de90924a 100644 --- a/milli/src/update/index_documents/transform.rs +++ b/milli/src/update/index_documents/transform.rs @@ -146,11 +146,11 @@ impl<'a, 'i> Transform<'a, 'i> { R: Read + Seek, F: Fn(UpdateIndexingStep) + Sync, { - let mut cursor = reader.into_cursor(); - let fields_index = cursor.documents_batch_index(); + let (mut cursor, fields_index) = reader.into_cursor_and_fields_index(); + let external_documents_ids = self.index.external_documents_ids(wtxn)?; - let mapping = create_fields_mapping(&mut self.fields_ids_map, fields_index)?; + let mapping = create_fields_mapping(&mut self.fields_ids_map, &fields_index)?; let primary_key = cursor.primary_key().to_string(); let primary_key_id = From 41a0ce07cb5f07b886df99ddbf1889164486b0a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Lecrenier?= Date: Wed, 20 Jul 2022 16:20:35 +0200 Subject: [PATCH 35/35] Add a code comment, as suggested in PR review Co-authored-by: Many the fish --- milli/src/documents/builder.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/milli/src/documents/builder.rs b/milli/src/documents/builder.rs index dc027e1b7..1a57db34b 100644 --- a/milli/src/documents/builder.rs +++ b/milli/src/documents/builder.rs @@ -97,6 +97,7 @@ impl DocumentsBatchBuilder { .map(|(k, t)| (self.fields_index.insert(k), t)) .enumerate() .collect(); + // Make sure that we insert the fields ids in order as the obkv writer has this requirement. typed_fields_ids.sort_unstable_by_key(|(_, (fid, _))| *fid); let mut record = csv::StringRecord::new();
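Taken together, the last patches leave the documents module with a streaming write path and a paired read path, and `append_csv` now documents its invariant that field ids must reach the obkv writer in ascending order. A small end-to-end sketch under those APIs; the typed `name:type` CSV headers and the sample rows are illustrative, not taken from the test suite:

```rust
use std::io::Cursor;

use milli::documents::{DocumentsBatchBuilder, DocumentsBatchReader};

fn main() {
    // Typed headers: `city` is kept as a string, `pop` is parsed as a number.
    let csv_content = "city:string,pop:number\nBoston,667137\nLyon,522250\n";
    let csv = csv::Reader::from_reader(csv_content.as_bytes());

    let mut builder = DocumentsBatchBuilder::new(Vec::new());
    builder.append_csv(csv).unwrap();
    assert_eq!(builder.documents_count(), 2);
    let vector = builder.into_inner().unwrap();

    let (mut cursor, index) = DocumentsBatchReader::from_reader(Cursor::new(vector))
        .unwrap()
        .into_cursor_and_fields_index();

    // Both typed headers were registered in the fields index.
    assert_eq!(index.len(), 2);
    while let Some(document) = cursor.next_document().unwrap() {
        assert_eq!(document.iter().count(), 2);
    }
}
```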