From 615825b9fd856149ce4a17834753ba61c1587856 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 10:56:24 +0200 Subject: [PATCH 01/10] Remove the serde Serializer --- meilisearch-core/benches/search_benchmark.rs | 6 +- meilisearch-core/src/serde/mod.rs | 2 - meilisearch-core/src/serde/serializer.rs | 361 ------------------ .../src/update/documents_addition.rs | 82 ++-- 4 files changed, 57 insertions(+), 394 deletions(-) delete mode 100644 meilisearch-core/src/serde/serializer.rs diff --git a/meilisearch-core/benches/search_benchmark.rs b/meilisearch-core/benches/search_benchmark.rs index e2b0c0f5a..dc4c7cdc1 100644 --- a/meilisearch-core/benches/search_benchmark.rs +++ b/meilisearch-core/benches/search_benchmark.rs @@ -8,7 +8,7 @@ use std::fs::File; use std::io::BufReader; use std::iter; -use meilisearch_core::Database; +use meilisearch_core::{Database, DatabaseOptions}; use meilisearch_core::{ProcessedUpdateResult, UpdateStatus}; use meilisearch_core::settings::{Settings, SettingsUpdate}; use meilisearch_schema::Schema; @@ -17,7 +17,7 @@ use serde_json::Value; use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId}; fn prepare_database(path: &Path) -> Database { - let database = Database::open_or_create(path).unwrap(); + let database = Database::open_or_create(path, DatabaseOptions::default()).unwrap(); let db = &database; let (sender, receiver) = mpsc::sync_channel(100); @@ -27,7 +27,7 @@ fn prepare_database(path: &Path) -> Database { let index = database.create_index("bench").unwrap(); database.set_update_callback(Box::new(update_fn)); - + let mut writer = db.main_write_txn().unwrap(); index.main.put_schema(&mut writer, &Schema::with_primary_key("id")).unwrap(); writer.commit().unwrap(); diff --git a/meilisearch-core/src/serde/mod.rs b/meilisearch-core/src/serde/mod.rs index 46352c0f5..9cb8e50bc 100644 --- a/meilisearch-core/src/serde/mod.rs +++ b/meilisearch-core/src/serde/mod.rs @@ -13,14 +13,12 @@ mod convert_to_string; mod deserializer; mod extract_document_id; mod indexer; -mod serializer; pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_string::ConvertToString; pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; pub use self::indexer::Indexer; -pub use self::serializer::{serialize_value, serialize_value_with_id, Serializer}; use std::{error::Error, fmt}; diff --git a/meilisearch-core/src/serde/serializer.rs b/meilisearch-core/src/serde/serializer.rs deleted file mode 100644 index 6142e96e7..000000000 --- a/meilisearch-core/src/serde/serializer.rs +++ /dev/null @@ -1,361 +0,0 @@ -use meilisearch_schema::{Schema, FieldId}; -use serde::ser; - -use crate::database::MainT; -use crate::raw_indexer::RawIndexer; -use crate::store::{DocumentsFields, DocumentsFieldsCounts}; -use crate::{DocumentId, RankedMap}; - -use super::{ConvertToNumber, ConvertToString, Indexer, SerializerError}; - -pub struct Serializer<'a, 'b> { - pub txn: &'a mut heed::RwTxn<'b, MainT>, - pub schema: &'a mut Schema, - pub document_store: DocumentsFields, - pub document_fields_counts: DocumentsFieldsCounts, - pub indexer: &'a mut RawIndexer, - pub ranked_map: &'a mut RankedMap, - pub document_id: DocumentId, -} - -impl<'a, 'b> ser::Serializer for Serializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapSerializer<'a, 'b>; - type SerializeStruct = StructSerializer<'a, 'b>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { type_name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_some(self, _value: &T) -> Result - where - T: ser::Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit struct", - }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit variant", - }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: ser::Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: ser::Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { - type_name: "sequence", - }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(MapSerializer { - txn: self.txn, - schema: self.schema, - document_id: self.document_id, - document_store: self.document_store, - document_fields_counts: self.document_fields_counts, - indexer: self.indexer, - ranked_map: self.ranked_map, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Ok(StructSerializer { - txn: self.txn, - schema: self.schema, - document_id: self.document_id, - document_store: self.document_store, - document_fields_counts: self.document_fields_counts, - indexer: self.indexer, - ranked_map: self.ranked_map, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "struct variant", - }) - } -} - -pub struct MapSerializer<'a, 'b> { - txn: &'a mut heed::RwTxn<'b, MainT>, - schema: &'a mut Schema, - document_id: DocumentId, - document_store: DocumentsFields, - document_fields_counts: DocumentsFieldsCounts, - indexer: &'a mut RawIndexer, - ranked_map: &'a mut RankedMap, - current_key_name: Option, -} - -impl<'a, 'b> ser::SerializeMap for MapSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let key = key.serialize(ConvertToString)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: ser::Serialize, - V: ser::Serialize, - { - let key = key.serialize(ConvertToString)?; - serialize_value( - self.txn, - key.as_str(), - self.schema, - self.document_id, - self.document_store, - self.document_fields_counts, - self.indexer, - self.ranked_map, - value, - ) - } - - fn end(self) -> Result { - Ok(()) - } -} - -pub struct StructSerializer<'a, 'b> { - txn: &'a mut heed::RwTxn<'b, MainT>, - schema: &'a mut Schema, - document_id: DocumentId, - document_store: DocumentsFields, - document_fields_counts: DocumentsFieldsCounts, - indexer: &'a mut RawIndexer, - ranked_map: &'a mut RankedMap, -} - -impl<'a, 'b> ser::SerializeStruct for StructSerializer<'a, 'b> { - type Ok = (); - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - serialize_value( - self.txn, - key, - self.schema, - self.document_id, - self.document_store, - self.document_fields_counts, - self.indexer, - self.ranked_map, - value, - ) - } - - fn end(self) -> Result { - Ok(()) - } -} - -pub fn serialize_value<'a, T: ?Sized>( - txn: &mut heed::RwTxn, - attribute: &str, - schema: &'a mut Schema, - document_id: DocumentId, - document_store: DocumentsFields, - documents_fields_counts: DocumentsFieldsCounts, - indexer: &mut RawIndexer, - ranked_map: &mut RankedMap, - value: &T, -) -> Result<(), SerializerError> -where - T: ser::Serialize, -{ - let field_id = schema.insert_and_index(&attribute)?; - serialize_value_with_id( - txn, - field_id, - schema, - document_id, - document_store, - documents_fields_counts, - indexer, - ranked_map, - value, - ) -} - -pub fn serialize_value_with_id<'a, T: ?Sized>( - txn: &mut heed::RwTxn, - field_id: FieldId, - schema: &'a Schema, - document_id: DocumentId, - document_store: DocumentsFields, - documents_fields_counts: DocumentsFieldsCounts, - indexer: &mut RawIndexer, - ranked_map: &mut RankedMap, - value: &T, -) -> Result<(), SerializerError> -where - T: ser::Serialize, -{ - let serialized = serde_json::to_vec(value)?; - document_store.put_document_field(txn, document_id, field_id, &serialized)?; - - if let Some(indexed_pos) = schema.is_indexed(field_id) { - let indexer = Indexer { - pos: *indexed_pos, - indexer, - document_id, - }; - if let Some(number_of_words) = value.serialize(indexer)? { - documents_fields_counts.put_document_field_count( - txn, - document_id, - *indexed_pos, - number_of_words as u16, - )?; - } - } - - if schema.is_ranked(field_id) { - let number = value.serialize(ConvertToNumber).unwrap_or_default(); - ranked_map.insert(document_id, field_id, number); - } - - Ok(()) -} diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index d8f1f53f1..c65cf6e81 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -4,12 +4,14 @@ use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; use sdset::{duo::Union, SetOperation}; use serde::{Deserialize, Serialize}; +use serde_json::Value; use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; -use crate::serde::{extract_document_id, serialize_value_with_id, Deserializer, Serializer}; +use crate::serde::{extract_document_id, Deserializer}; +use crate::serde::{ConvertToNumber, Indexer}; use crate::store; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, MResult, RankedMap}; @@ -107,7 +109,7 @@ pub fn push_documents_addition( pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + addition: Vec>, partial: bool ) -> MResult<()> { let mut documents_additions = HashMap::new(); @@ -137,7 +139,7 @@ pub fn apply_addition<'a, 'b>( // retrieve the old document and // update the new one with missing keys found in the old one - let result = Option::>::deserialize(&mut deserializer)?; + let result = Option::>::deserialize(&mut deserializer)?; if let Some(old_document) = result { for (key, value) in old_document { document.entry(key).or_insert(value); @@ -170,18 +172,33 @@ pub fn apply_addition<'a, 'b>( let mut indexer = RawIndexer::new(stop_words); + // For each document in this update for (document_id, document) in documents_additions { - let serializer = Serializer { - txn: writer, - schema: &mut schema, - document_store: index.documents_fields, - document_fields_counts: index.documents_fields_counts, - indexer: &mut indexer, - ranked_map: &mut ranked_map, - document_id, - }; - document.serialize(serializer)?; + // For each key-value pair in the document. + for (attribute, value) in document { + + let field_id = schema.insert_and_index(&attribute)?; + let serialized = serde_json::to_vec(&value)?; + index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; + + if let Some(indexed_pos) = schema.is_indexed(field_id) { + let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; + if let Some(number_of_words) = value.serialize(indexer)? { + index.documents_fields_counts.put_document_field_count( + writer, + document_id, + *indexed_pos, + number_of_words as u16, + )?; + } + } + + if schema.is_ranked(field_id) { + let number = value.serialize(ConvertToNumber).unwrap_or_default(); + ranked_map.insert(document_id, field_id, number); + } + } } write_documents_addition_index( @@ -200,7 +217,7 @@ pub fn apply_addition<'a, 'b>( pub fn apply_documents_partial_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + addition: Vec>, ) -> MResult<()> { apply_addition(writer, index, addition, true) } @@ -208,7 +225,7 @@ pub fn apply_documents_partial_addition<'a, 'b>( pub fn apply_documents_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + addition: Vec>, ) -> MResult<()> { apply_addition(writer, index, addition, false) } @@ -253,22 +270,31 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind for document_id in documents_ids_to_reindex { for result in index.documents_fields.document_fields(writer, document_id)? { let (field_id, bytes) = result?; - let value: serde_json::Value = serde_json::from_slice(bytes)?; + let value: Value = serde_json::from_slice(bytes)?; ram_store.insert((document_id, field_id), value); } - for ((docid, field_id), value) in ram_store.drain() { - serialize_value_with_id( - writer, - field_id, - &schema, - docid, - index.documents_fields, - index.documents_fields_counts, - &mut indexer, - &mut ranked_map, - &value - )?; + // For each key-value pair in the document. + for ((document_id, field_id), value) in ram_store.drain() { + let serialized = serde_json::to_vec(&value)?; + index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; + + if let Some(indexed_pos) = schema.is_indexed(field_id) { + let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; + if let Some(number_of_words) = value.serialize(indexer)? { + index.documents_fields_counts.put_document_field_count( + writer, + document_id, + *indexed_pos, + number_of_words as u16, + )?; + } + } + + if schema.is_ranked(field_id) { + let number = value.serialize(ConvertToNumber).unwrap_or_default(); + ranked_map.insert(document_id, field_id, number); + } } } From 5e063da14fc76dffe08cd138c0f38226d80b2baa Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 12:22:06 +0200 Subject: [PATCH 02/10] Remove the serde Indexer --- meilisearch-core/Cargo.toml | 2 +- meilisearch-core/src/serde/indexer.rs | 362 ------------------ meilisearch-core/src/serde/mod.rs | 2 - .../src/update/documents_addition.rs | 78 +++- 4 files changed, 74 insertions(+), 370 deletions(-) delete mode 100644 meilisearch-core/src/serde/indexer.rs diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 1706b9af6..d9b88f89b 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -34,7 +34,7 @@ pest_derive = "2.0" regex = "1.3.6" sdset = "0.4.0" serde = { version = "1.0.105", features = ["derive"] } -serde_json = "1.0.50" +serde_json = { version = "1.0.50", features = ["preserve_order"] } siphasher = "0.3.2" slice-group-by = "0.2.6" unicase = "2.6.0" diff --git a/meilisearch-core/src/serde/indexer.rs b/meilisearch-core/src/serde/indexer.rs deleted file mode 100644 index c8b6abeaf..000000000 --- a/meilisearch-core/src/serde/indexer.rs +++ /dev/null @@ -1,362 +0,0 @@ -use meilisearch_schema::IndexedPos; -use serde::ser; -use serde::Serialize; - -use super::{ConvertToString, SerializerError}; -use crate::raw_indexer::RawIndexer; -use crate::DocumentId; - -pub struct Indexer<'a> { - pub pos: IndexedPos, - pub indexer: &'a mut RawIndexer, - pub document_id: DocumentId, -} - -impl<'a> ser::Serializer for Indexer<'a> { - type Ok = Option; - type Error = SerializerError; - type SerializeSeq = SeqIndexer<'a>; - type SerializeTuple = TupleIndexer<'a>; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapIndexer<'a>; - type SerializeStruct = StructIndexer<'a>; - type SerializeStructVariant = ser::Impossible; - - fn serialize_bool(self, _value: bool) -> Result { - Ok(None) - } - - fn serialize_char(self, value: char) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i8(self, value: i8) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i16(self, value: i16) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i32(self, value: i32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i64(self, value: i64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u8(self, value: u8) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u16(self, value: u16) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u32(self, value: u32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u64(self, value: u64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_f32(self, value: f32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_f64(self, value: f64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_str(self, text: &str) -> Result { - let number_of_words = self - .indexer - .index_text(self.document_id, self.pos, text); - Ok(Some(number_of_words)) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnindexableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Ok(None) - } - - fn serialize_some(self, value: &T) -> Result - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - let number_of_words = self - .indexer - .index_text(self.document_id, self.pos, &text); - Ok(Some(number_of_words)) - } - - fn serialize_unit(self) -> Result { - Ok(None) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Ok(None) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Ok(None) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: ser::Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: ser::Serialize, - { - Err(SerializerError::UnindexableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - let indexer = SeqIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_tuple(self, _len: usize) -> Result { - let indexer = TupleIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - let indexer = MapIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - let indexer = StructIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "struct variant", - }) - } -} - -pub struct SeqIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeSeq for SeqIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct MapIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeMap for MapIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = key.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct StructIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeStruct for StructIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let key_text = key.to_owned(); - let value_text = value.serialize(ConvertToString)?; - self.texts.push(key_text); - self.texts.push(value_text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct TupleIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeTuple for TupleIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} diff --git a/meilisearch-core/src/serde/mod.rs b/meilisearch-core/src/serde/mod.rs index 9cb8e50bc..6a1e51a09 100644 --- a/meilisearch-core/src/serde/mod.rs +++ b/meilisearch-core/src/serde/mod.rs @@ -12,13 +12,11 @@ mod convert_to_number; mod convert_to_string; mod deserializer; mod extract_document_id; -mod indexer; pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_string::ConvertToString; pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; -pub use self::indexer::Indexer; use std::{error::Error, fmt}; diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index c65cf6e81..b7e6395af 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,4 +1,6 @@ use std::collections::HashMap; +use std::fmt::Write as _; +use std::fmt; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; @@ -6,12 +8,15 @@ use sdset::{duo::Union, SetOperation}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use meilisearch_types::DocumentId; +use meilisearch_schema::IndexedPos; + use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::{extract_document_id, Deserializer}; -use crate::serde::{ConvertToNumber, Indexer}; +use crate::serde::ConvertToNumber; use crate::store; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, MResult, RankedMap}; @@ -106,6 +111,69 @@ pub fn push_documents_addition( Ok(last_update_id) } +// TODO move this helper functions elsewhere +/// Returns the number of words indexed or `None` if the type +fn index_value( + indexer: &mut RawIndexer, + document_id: DocumentId, + indexed_pos: IndexedPos, + value: &Value, +) -> Option +{ + fn value_to_string(string: &mut String, value: &Value) { + match value { + Value::Null => (), + Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, + Value::Number(number) => { let _ = write!(string, "{}", &number); }, + Value::String(text) => string.push_str(&text), + Value::Array(array) => { + for value in array { + value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + Value::Object(object) => { + for (key, value) in object { + string.push_str(key); + let _ = string.write_str(". "); + value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + } + } + + match value { + Value::Null => None, + Value::Bool(boolean) => { + let text = boolean.to_string(); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::Number(number) => { + let text = number.to_string(); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::String(string) => { + let number_of_words = indexer.index_text(document_id, indexed_pos, &string); + Some(number_of_words) + }, + Value::Array(_) => { + let mut text = String::new(); + value_to_string(&mut text, value); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::Object(_) => { + let mut text = String::new(); + value_to_string(&mut text, value); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + } +} + pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, @@ -183,8 +251,8 @@ pub fn apply_addition<'a, 'b>( index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; if let Some(indexed_pos) = schema.is_indexed(field_id) { - let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; - if let Some(number_of_words) = value.serialize(indexer)? { + let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); + if let Some(number_of_words) = number_of_words { index.documents_fields_counts.put_document_field_count( writer, document_id, @@ -280,8 +348,8 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; if let Some(indexed_pos) = schema.is_indexed(field_id) { - let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; - if let Some(number_of_words) = value.serialize(indexer)? { + let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); + if let Some(number_of_words) = number_of_words { index.documents_fields_counts.put_document_field_count( writer, document_id, From 65ed2dcc1b6976fb8a6e61f65af6ceb565f68f89 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 12:22:41 +0200 Subject: [PATCH 03/10] Remove the serde ConvertToNumber --- .../src/serde/convert_to_number.rs | 198 ------------------ meilisearch-core/src/serde/mod.rs | 2 - .../src/update/documents_addition.rs | 24 ++- 3 files changed, 18 insertions(+), 206 deletions(-) delete mode 100644 meilisearch-core/src/serde/convert_to_number.rs diff --git a/meilisearch-core/src/serde/convert_to_number.rs b/meilisearch-core/src/serde/convert_to_number.rs deleted file mode 100644 index a67e01692..000000000 --- a/meilisearch-core/src/serde/convert_to_number.rs +++ /dev/null @@ -1,198 +0,0 @@ -use std::str::FromStr; - -use ordered_float::OrderedFloat; -use serde::ser; -use serde::Serialize; - -use super::SerializerError; -use crate::Number; - -pub struct ConvertToNumber; - -impl ser::Serializer for ConvertToNumber { - type Ok = Number; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - fn serialize_bool(self, value: bool) -> Result { - Ok(Number::Unsigned(u64::from(value))) - } - - fn serialize_char(self, _value: char) -> Result { - Err(SerializerError::UnrankableType { type_name: "char" }) - } - - fn serialize_i8(self, value: i8) -> Result { - Ok(Number::Signed(i64::from(value))) - } - - fn serialize_i16(self, value: i16) -> Result { - Ok(Number::Signed(i64::from(value))) - } - - fn serialize_i32(self, value: i32) -> Result { - Ok(Number::Signed(i64::from(value))) - } - - fn serialize_i64(self, value: i64) -> Result { - Ok(Number::Signed(value)) - } - - fn serialize_u8(self, value: u8) -> Result { - Ok(Number::Unsigned(u64::from(value))) - } - - fn serialize_u16(self, value: u16) -> Result { - Ok(Number::Unsigned(u64::from(value))) - } - - fn serialize_u32(self, value: u32) -> Result { - Ok(Number::Unsigned(u64::from(value))) - } - - fn serialize_u64(self, value: u64) -> Result { - Ok(Number::Unsigned(value)) - } - - fn serialize_f32(self, value: f32) -> Result { - Ok(Number::Float(OrderedFloat(f64::from(value)))) - } - - fn serialize_f64(self, value: f64) -> Result { - Ok(Number::Float(OrderedFloat(value))) - } - - fn serialize_str(self, value: &str) -> Result { - Ok(Number::from_str(value)?) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnrankableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnrankableType { - type_name: "Option", - }) - } - - fn serialize_some(self, _value: &T) -> Result - where - T: Serialize, - { - Err(SerializerError::UnrankableType { - type_name: "Option", - }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnrankableType { type_name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnrankableType { - type_name: "unit struct", - }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(SerializerError::UnrankableType { - type_name: "unit variant", - }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(SerializerError::UnrankableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnrankableType { - type_name: "sequence", - }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnrankableType { type_name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnrankableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnrankableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnrankableType { type_name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnrankableType { - type_name: "struct", - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnrankableType { - type_name: "struct variant", - }) - } -} diff --git a/meilisearch-core/src/serde/mod.rs b/meilisearch-core/src/serde/mod.rs index 6a1e51a09..50e18687e 100644 --- a/meilisearch-core/src/serde/mod.rs +++ b/meilisearch-core/src/serde/mod.rs @@ -8,12 +8,10 @@ macro_rules! forward_to_unserializable_type { } } -mod convert_to_number; mod convert_to_string; mod deserializer; mod extract_document_id; -pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_string::ConvertToString; pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index b7e6395af..ddb3c8340 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,11 +1,10 @@ use std::collections::HashMap; use std::fmt::Write as _; -use std::fmt; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; use sdset::{duo::Union, SetOperation}; -use serde::{Deserialize, Serialize}; +use serde::Deserialize; use serde_json::Value; use meilisearch_types::DocumentId; @@ -16,10 +15,9 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::{extract_document_id, Deserializer}; -use crate::serde::ConvertToNumber; use crate::store; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; -use crate::{Error, MResult, RankedMap}; +use crate::{Error, Number, MResult, RankedMap}; pub struct DocumentsAddition { updates_store: store::Updates, @@ -174,6 +172,20 @@ fn index_value( } } +// TODO move this helper functions elsewhere +fn value_to_number(value: &Value) -> Option { + use std::str::FromStr; + + match value { + Value::Null => None, + Value::Bool(boolean) => Some(Number::Unsigned(*boolean as u64)), + Value::Number(number) => Number::from_str(&number.to_string()).ok(), // TODO improve that + Value::String(string) => Number::from_str(string).ok(), + Value::Array(_array) => None, + Value::Object(_object) => None, + } +} + pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, @@ -263,7 +275,7 @@ pub fn apply_addition<'a, 'b>( } if schema.is_ranked(field_id) { - let number = value.serialize(ConvertToNumber).unwrap_or_default(); + let number = value_to_number(&value).unwrap_or_default(); ranked_map.insert(document_id, field_id, number); } } @@ -360,7 +372,7 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind } if schema.is_ranked(field_id) { - let number = value.serialize(ConvertToNumber).unwrap_or_default(); + let number = value_to_number(&value).unwrap_or_default(); ranked_map.insert(document_id, field_id, number); } } From 2558ce9a00676eacac8cbf7964c61585000c17ba Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 12:40:33 +0200 Subject: [PATCH 04/10] Export the value_to_string helper function --- .../src/update/documents_addition.rs | 59 ++++++++++--------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index ddb3c8340..e737e5068 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -118,29 +118,6 @@ fn index_value( value: &Value, ) -> Option { - fn value_to_string(string: &mut String, value: &Value) { - match value { - Value::Null => (), - Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, - Value::Number(number) => { let _ = write!(string, "{}", &number); }, - Value::String(text) => string.push_str(&text), - Value::Array(array) => { - for value in array { - value_to_string(string, value); - let _ = string.write_str(". "); - } - }, - Value::Object(object) => { - for (key, value) in object { - string.push_str(key); - let _ = string.write_str(". "); - value_to_string(string, value); - let _ = string.write_str(". "); - } - }, - } - } - match value { Value::Null => None, Value::Bool(boolean) => { @@ -158,20 +135,48 @@ fn index_value( Some(number_of_words) }, Value::Array(_) => { - let mut text = String::new(); - value_to_string(&mut text, value); + let text = value_to_string(value); let number_of_words = indexer.index_text(document_id, indexed_pos, &text); Some(number_of_words) }, Value::Object(_) => { - let mut text = String::new(); - value_to_string(&mut text, value); + let text = value_to_string(value); let number_of_words = indexer.index_text(document_id, indexed_pos, &text); Some(number_of_words) }, } } +// TODO move this helper functions elsewhere +fn value_to_string(value: &Value) -> String { + fn internal_value_to_string(string: &mut String, value: &Value) { + match value { + Value::Null => (), + Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, + Value::Number(number) => { let _ = write!(string, "{}", &number); }, + Value::String(text) => string.push_str(&text), + Value::Array(array) => { + for value in array { + internal_value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + Value::Object(object) => { + for (key, value) in object { + string.push_str(key); + let _ = string.write_str(". "); + internal_value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + } + } + + let mut string = String::new(); + internal_value_to_string(&mut string, value); + string +} + // TODO move this helper functions elsewhere fn value_to_number(value: &Value) -> Option { use std::str::FromStr; From 25b3c9a057ab06f5e3b1207163c77c095a4363c9 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 13:19:19 +0200 Subject: [PATCH 05/10] Remove the serde ExtractDocumentId struct --- meilisearch-core/src/lib.rs | 2 +- .../src/serde/convert_to_string.rs | 279 ---------------- .../src/serde/extract_document_id.rs | 310 ------------------ meilisearch-core/src/serde/mod.rs | 18 +- .../src/update/documents_addition.rs | 44 ++- .../src/update/documents_deletion.rs | 17 - meilisearch-core/src/update/mod.rs | 1 + meilisearch-http/src/routes/document.rs | 11 +- 8 files changed, 47 insertions(+), 635 deletions(-) delete mode 100644 meilisearch-core/src/serde/convert_to_string.rs delete mode 100644 meilisearch-core/src/serde/extract_document_id.rs diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index 179c7746c..9cec35976 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -18,7 +18,7 @@ mod query_words_mapper; mod ranked_map; mod raw_document; mod reordered_attrs; -mod update; +pub mod update; pub mod criterion; pub mod facets; pub mod raw_indexer; diff --git a/meilisearch-core/src/serde/convert_to_string.rs b/meilisearch-core/src/serde/convert_to_string.rs deleted file mode 100644 index cd072c904..000000000 --- a/meilisearch-core/src/serde/convert_to_string.rs +++ /dev/null @@ -1,279 +0,0 @@ -use serde::ser; -use serde::Serialize; - -use super::SerializerError; - -pub struct ConvertToString; - -impl ser::Serializer for ConvertToString { - type Ok = String; - type Error = SerializerError; - type SerializeSeq = SeqConvertToString; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapConvertToString; - type SerializeStruct = StructConvertToString; - type SerializeStructVariant = ser::Impossible; - - fn serialize_bool(self, value: bool) -> Result { - Ok(value.to_string()) - } - - fn serialize_char(self, value: char) -> Result { - Ok(value.to_string()) - } - - fn serialize_i8(self, value: i8) -> Result { - Ok(value.to_string()) - } - - fn serialize_i16(self, value: i16) -> Result { - Ok(value.to_string()) - } - - fn serialize_i32(self, value: i32) -> Result { - Ok(value.to_string()) - } - - fn serialize_i64(self, value: i64) -> Result { - Ok(value.to_string()) - } - - fn serialize_u8(self, value: u8) -> Result { - Ok(value.to_string()) - } - - fn serialize_u16(self, value: u16) -> Result { - Ok(value.to_string()) - } - - fn serialize_u32(self, value: u32) -> Result { - Ok(value.to_string()) - } - - fn serialize_u64(self, value: u64) -> Result { - Ok(value.to_string()) - } - - fn serialize_f32(self, value: f32) -> Result { - Ok(value.to_string()) - } - - fn serialize_f64(self, value: f64) -> Result { - Ok(value.to_string()) - } - - fn serialize_str(self, value: &str) -> Result { - Ok(value.to_string()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_some(self, _value: &T) -> Result - where - T: Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_unit(self) -> Result { - Ok(String::new()) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit struct", - }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit variant", - }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Ok(SeqConvertToString { - text: String::new(), - }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(MapConvertToString { - text: String::new(), - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Ok(StructConvertToString { - text: String::new(), - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "struct variant", - }) - } -} - -pub struct MapConvertToString { - text: String, -} - -impl ser::SerializeMap for MapConvertToString { - type Ok = String; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = key.serialize(ConvertToString)?; - self.text.push_str(&text); - self.text.push_str(" "); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - self.text.push_str(&text); - Ok(()) - } - - fn end(self) -> Result { - Ok(self.text) - } -} - -pub struct StructConvertToString { - text: String, -} - -impl ser::SerializeStruct for StructConvertToString { - type Ok = String; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let value = value.serialize(ConvertToString)?; - self.text.push_str(key); - self.text.push_str(" "); - self.text.push_str(&value); - Ok(()) - } - - fn end(self) -> Result { - Ok(self.text) - } -} - -pub struct SeqConvertToString { - text: String, -} - -impl ser::SerializeSeq for SeqConvertToString { - type Ok = String; - type Error = SerializerError; - - fn serialize_element(&mut self, key: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = key.serialize(ConvertToString)?; - self.text.push_str(&text); - self.text.push_str(" "); - Ok(()) - } - - fn end(self) -> Result { - Ok(self.text) - } -} diff --git a/meilisearch-core/src/serde/extract_document_id.rs b/meilisearch-core/src/serde/extract_document_id.rs deleted file mode 100644 index c7303a8ec..000000000 --- a/meilisearch-core/src/serde/extract_document_id.rs +++ /dev/null @@ -1,310 +0,0 @@ -use std::hash::{Hash, Hasher}; - -use crate::DocumentId; -use serde::{ser, Serialize}; -use serde_json::{Value, Number}; -use siphasher::sip::SipHasher; - -use super::{ConvertToString, SerializerError}; - -pub fn extract_document_id( - primary_key: &str, - document: &D, -) -> Result, SerializerError> -where - D: serde::Serialize, -{ - let serializer = ExtractDocumentId { primary_key }; - document.serialize(serializer) -} - -fn validate_number(value: &Number) -> Option { - if value.is_f64() { - return None - } - Some(value.to_string()) -} - -fn validate_string(value: &str) -> Option { - if value.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { - Some(value.to_string()) - } else { - None - } -} - -pub fn value_to_string(value: &Value) -> Option { - match value { - Value::Null => None, - Value::Bool(_) => None, - Value::Number(value) => validate_number(value), - Value::String(value) => validate_string(value), - Value::Array(_) => None, - Value::Object(_) => None, - } -} - -pub fn compute_document_id(t: H) -> DocumentId { - let mut s = SipHasher::new(); - t.hash(&mut s); - let hash = s.finish(); - DocumentId(hash) -} - -struct ExtractDocumentId<'a> { - primary_key: &'a str, -} - -impl<'a> ser::Serializer for ExtractDocumentId<'a> { - type Ok = Option; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ExtractDocumentIdMapSerializer<'a>; - type SerializeStruct = ExtractDocumentIdStructSerializer<'a>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _value: &str) -> Result { - Err(SerializerError::UnserializableType { type_name: "str" }) - } - - fn serialize_bytes(self, _value: &[u8]) -> Result { - Err(SerializerError::UnserializableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_some(self, _value: &T) -> Result - where - T: Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "Option", - }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { type_name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit struct", - }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "unit variant", - }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: Serialize, - { - Err(SerializerError::UnserializableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { - type_name: "sequence", - }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { type_name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - let serializer = ExtractDocumentIdMapSerializer { - primary_key: self.primary_key, - document_id: None, - current_key_name: None, - }; - - Ok(serializer) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - let serializer = ExtractDocumentIdStructSerializer { - primary_key: self.primary_key, - document_id: None, - }; - - Ok(serializer) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnserializableType { - type_name: "struct variant", - }) - } -} - -pub struct ExtractDocumentIdMapSerializer<'a> { - primary_key: &'a str, - document_id: Option, - current_key_name: Option, -} - -impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let key = key.serialize(ConvertToString)?; - self.current_key_name = Some(key); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where - K: Serialize, - V: Serialize, - { - let key = key.serialize(ConvertToString)?; - - if self.primary_key == key { - let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; - match value_to_string(&value).map(|s| compute_document_id(&s)) { - Some(document_id) => self.document_id = Some(document_id), - None => return Err(SerializerError::InvalidDocumentIdType), - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(self.document_id) - } -} - -pub struct ExtractDocumentIdStructSerializer<'a> { - primary_key: &'a str, - document_id: Option, -} - -impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: Serialize, - { - if self.primary_key == key { - let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; - match value_to_string(&value).map(compute_document_id) { - Some(document_id) => self.document_id = Some(document_id), - None => return Err(SerializerError::InvalidDocumentIdType), - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(self.document_id) - } -} diff --git a/meilisearch-core/src/serde/mod.rs b/meilisearch-core/src/serde/mod.rs index 50e18687e..90b2843c9 100644 --- a/meilisearch-core/src/serde/mod.rs +++ b/meilisearch-core/src/serde/mod.rs @@ -1,20 +1,6 @@ -macro_rules! forward_to_unserializable_type { - ($($ty:ident => $se_method:ident,)*) => { - $( - fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { type_name: "$ty" }) - } - )* - } -} - -mod convert_to_string; mod deserializer; -mod extract_document_id; -pub use self::convert_to_string::ConvertToString; pub use self::deserializer::{Deserializer, DeserializerError}; -pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; use std::{error::Error, fmt}; @@ -27,7 +13,7 @@ use crate::ParseNumberError; #[derive(Debug)] pub enum SerializerError { DocumentIdNotFound, - InvalidDocumentIdType, + InvalidDocumentIdFormat, Zlmdb(heed::Error), SerdeJson(SerdeJsonError), ParseNumber(ParseNumberError), @@ -50,7 +36,7 @@ impl fmt::Display for SerializerError { SerializerError::DocumentIdNotFound => { f.write_str("serialized document does not have an id according to the schema") } - SerializerError::InvalidDocumentIdType => { + SerializerError::InvalidDocumentIdFormat => { f.write_str("a document primary key can be of type integer or string only composed of alphanumeric characters, hyphens (-) and underscores (_).") } SerializerError::Zlmdb(e) => write!(f, "heed related error: {}", e), diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index e737e5068..ec99858e2 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,11 +1,13 @@ use std::collections::HashMap; use std::fmt::Write as _; +use std::hash::{Hash, Hasher}; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; use sdset::{duo::Union, SetOperation}; use serde::Deserialize; use serde_json::Value; +use siphasher::sip::SipHasher; use meilisearch_types::DocumentId; use meilisearch_schema::IndexedPos; @@ -14,7 +16,7 @@ use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; -use crate::serde::{extract_document_id, Deserializer}; +use crate::serde::{Deserializer, SerializerError}; use crate::store; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, Number, MResult, RankedMap}; @@ -148,7 +150,7 @@ fn index_value( } // TODO move this helper functions elsewhere -fn value_to_string(value: &Value) -> String { +pub fn value_to_string(value: &Value) -> String { fn internal_value_to_string(string: &mut String, value: &Value) { match value { Value::Null => (), @@ -191,6 +193,39 @@ fn value_to_number(value: &Value) -> Option { } } +// TODO move this helper functions elsewhere +pub fn compute_document_id(t: H) -> DocumentId { + let mut s = SipHasher::new(); + t.hash(&mut s); + let hash = s.finish(); + DocumentId(hash) +} + +// TODO move this helper functions elsewhere +pub fn extract_document_id(primary_key: &str, document: &IndexMap) -> Result { + + fn validate_document_id(string: &str) -> bool { + string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') + } + + match document.get(primary_key) { + Some(value) => { + let string = match value { + Value::Number(number) => number.to_string(), + Value::String(string) => string.clone(), + _ => return Err(SerializerError::InvalidDocumentIdFormat), + }; + + if validate_document_id(&string) { + Ok(compute_document_id(string)) + } else { + Err(SerializerError::InvalidDocumentIdFormat) + } + } + None => Err(SerializerError::DocumentIdNotFound), + } +} + pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, @@ -208,10 +243,7 @@ pub fn apply_addition<'a, 'b>( // 1. store documents ids for future deletion for mut document in addition { - let document_id = match extract_document_id(&primary_key, &document)? { - Some(id) => id, - None => return Err(Error::MissingDocumentId), - }; + let document_id = extract_document_id(&primary_key, &document)?; if partial { let mut deserializer = Deserializer { diff --git a/meilisearch-core/src/update/documents_deletion.rs b/meilisearch-core/src/update/documents_deletion.rs index 30d563efb..4526d053d 100644 --- a/meilisearch-core/src/update/documents_deletion.rs +++ b/meilisearch-core/src/update/documents_deletion.rs @@ -1,13 +1,11 @@ use std::collections::{BTreeSet, HashMap, HashSet}; use fst::{SetBuilder, Streamer}; -use meilisearch_schema::Schema; use sdset::{duo::DifferenceByKey, SetBuf, SetOperation}; use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; -use crate::serde::extract_document_id; use crate::store; use crate::update::{next_update_id, compute_short_prefixes, Update}; use crate::{DocumentId, Error, MResult, RankedMap}; @@ -37,21 +35,6 @@ impl DocumentsDeletion { self.documents.push(document_id); } - pub fn delete_document(&mut self, schema: &Schema, document: D) -> MResult<()> - where - D: serde::Serialize, - { - let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?; - let document_id = match extract_document_id(&primary_key, &document)? { - Some(id) => id, - None => return Err(Error::MissingDocumentId), - }; - - self.delete_document_by_id(document_id); - - Ok(()) - } - pub fn finalize(self, writer: &mut heed::RwTxn) -> MResult { let _ = self.updates_notifier.send(UpdateEvent::NewUpdate); let update_id = push_documents_deletion( diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index d9a2d41a7..6599c3f99 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -8,6 +8,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all}; pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::documents_addition::{ apply_documents_addition, apply_documents_partial_addition, DocumentsAddition, + value_to_string, compute_document_id, extract_document_id, }; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; pub use self::settings_update::{apply_settings_update, push_settings_update}; diff --git a/meilisearch-http/src/routes/document.rs b/meilisearch-http/src/routes/document.rs index ad90df8ca..4ef4027dc 100644 --- a/meilisearch-http/src/routes/document.rs +++ b/meilisearch-http/src/routes/document.rs @@ -42,7 +42,7 @@ async fn get_document( .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - let document_id = meilisearch_core::serde::compute_document_id(&path.document_id); + let document_id = meilisearch_core::update::compute_document_id(&path.document_id); let reader = data.db.main_read_txn()?; @@ -65,7 +65,7 @@ async fn delete_document( .db .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - let document_id = meilisearch_core::serde::compute_document_id(&path.document_id); + let document_id = meilisearch_core::update::compute_document_id(&path.document_id); let mut update_writer = data.db.update_write_txn()?; @@ -237,10 +237,9 @@ async fn delete_documents( let mut documents_deletion = index.documents_deletion(); for document_id in body.into_inner() { - if let Some(document_id) = meilisearch_core::serde::value_to_string(&document_id) { - documents_deletion - .delete_document_by_id(meilisearch_core::serde::compute_document_id(document_id)); - } + let document_id_string = meilisearch_core::update::value_to_string(&document_id); + let document_id = meilisearch_core::update::compute_document_id(document_id_string); + documents_deletion.delete_document_by_id(document_id); } let update_id = documents_deletion.finalize(&mut writer)?; From 2828b5fa191ebf2091c3ed2ce1c5c4877d4551c2 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 19 May 2020 14:37:13 +0200 Subject: [PATCH 06/10] Move the helper function to their own module --- .../src/update/documents_addition.rs | 126 +----------------- meilisearch-core/src/update/helpers.rs | 122 +++++++++++++++++ meilisearch-core/src/update/mod.rs | 16 +-- meilisearch-http/src/routes/document.rs | 22 ++- 4 files changed, 151 insertions(+), 135 deletions(-) create mode 100644 meilisearch-core/src/update/helpers.rs diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index ec99858e2..7f3ae178b 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,25 +1,20 @@ use std::collections::HashMap; -use std::fmt::Write as _; -use std::hash::{Hash, Hasher}; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; use sdset::{duo::Union, SetOperation}; use serde::Deserialize; use serde_json::Value; -use siphasher::sip::SipHasher; - -use meilisearch_types::DocumentId; -use meilisearch_schema::IndexedPos; use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; -use crate::serde::{Deserializer, SerializerError}; +use crate::serde::Deserializer; use crate::store; +use crate::update::helpers::{index_value, value_to_number, extract_document_id}; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; -use crate::{Error, Number, MResult, RankedMap}; +use crate::{Error, MResult, RankedMap}; pub struct DocumentsAddition { updates_store: store::Updates, @@ -111,121 +106,6 @@ pub fn push_documents_addition( Ok(last_update_id) } -// TODO move this helper functions elsewhere -/// Returns the number of words indexed or `None` if the type -fn index_value( - indexer: &mut RawIndexer, - document_id: DocumentId, - indexed_pos: IndexedPos, - value: &Value, -) -> Option -{ - match value { - Value::Null => None, - Value::Bool(boolean) => { - let text = boolean.to_string(); - let number_of_words = indexer.index_text(document_id, indexed_pos, &text); - Some(number_of_words) - }, - Value::Number(number) => { - let text = number.to_string(); - let number_of_words = indexer.index_text(document_id, indexed_pos, &text); - Some(number_of_words) - }, - Value::String(string) => { - let number_of_words = indexer.index_text(document_id, indexed_pos, &string); - Some(number_of_words) - }, - Value::Array(_) => { - let text = value_to_string(value); - let number_of_words = indexer.index_text(document_id, indexed_pos, &text); - Some(number_of_words) - }, - Value::Object(_) => { - let text = value_to_string(value); - let number_of_words = indexer.index_text(document_id, indexed_pos, &text); - Some(number_of_words) - }, - } -} - -// TODO move this helper functions elsewhere -pub fn value_to_string(value: &Value) -> String { - fn internal_value_to_string(string: &mut String, value: &Value) { - match value { - Value::Null => (), - Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, - Value::Number(number) => { let _ = write!(string, "{}", &number); }, - Value::String(text) => string.push_str(&text), - Value::Array(array) => { - for value in array { - internal_value_to_string(string, value); - let _ = string.write_str(". "); - } - }, - Value::Object(object) => { - for (key, value) in object { - string.push_str(key); - let _ = string.write_str(". "); - internal_value_to_string(string, value); - let _ = string.write_str(". "); - } - }, - } - } - - let mut string = String::new(); - internal_value_to_string(&mut string, value); - string -} - -// TODO move this helper functions elsewhere -fn value_to_number(value: &Value) -> Option { - use std::str::FromStr; - - match value { - Value::Null => None, - Value::Bool(boolean) => Some(Number::Unsigned(*boolean as u64)), - Value::Number(number) => Number::from_str(&number.to_string()).ok(), // TODO improve that - Value::String(string) => Number::from_str(string).ok(), - Value::Array(_array) => None, - Value::Object(_object) => None, - } -} - -// TODO move this helper functions elsewhere -pub fn compute_document_id(t: H) -> DocumentId { - let mut s = SipHasher::new(); - t.hash(&mut s); - let hash = s.finish(); - DocumentId(hash) -} - -// TODO move this helper functions elsewhere -pub fn extract_document_id(primary_key: &str, document: &IndexMap) -> Result { - - fn validate_document_id(string: &str) -> bool { - string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') - } - - match document.get(primary_key) { - Some(value) => { - let string = match value { - Value::Number(number) => number.to_string(), - Value::String(string) => string.clone(), - _ => return Err(SerializerError::InvalidDocumentIdFormat), - }; - - if validate_document_id(&string) { - Ok(compute_document_id(string)) - } else { - Err(SerializerError::InvalidDocumentIdFormat) - } - } - None => Err(SerializerError::DocumentIdNotFound), - } -} - pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs new file mode 100644 index 000000000..a3be7bb22 --- /dev/null +++ b/meilisearch-core/src/update/helpers.rs @@ -0,0 +1,122 @@ +use std::fmt::Write as _; +use std::hash::{Hash, Hasher}; + +use indexmap::IndexMap; +use meilisearch_schema::IndexedPos; +use meilisearch_types::DocumentId; +use serde_json::Value; +use siphasher::sip::SipHasher; + +use crate::raw_indexer::RawIndexer; +use crate::serde::SerializerError; +use crate::Number; + +/// Returns the number of words indexed or `None` if the type is unindexable. +pub fn index_value( + indexer: &mut RawIndexer, + document_id: DocumentId, + indexed_pos: IndexedPos, + value: &Value, +) -> Option +{ + match value { + Value::Null => None, + Value::Bool(boolean) => { + let text = boolean.to_string(); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::Number(number) => { + let text = number.to_string(); + Some(indexer.index_text(document_id, indexed_pos, &text)) + }, + Value::String(string) => { + Some(indexer.index_text(document_id, indexed_pos, &string)) + }, + Value::Array(_) => { + let text = value_to_string(value); + Some(indexer.index_text(document_id, indexed_pos, &text)) + }, + Value::Object(_) => { + let text = value_to_string(value); + Some(indexer.index_text(document_id, indexed_pos, &text)) + }, + } +} + +/// Transforms the JSON Value type into a String. +pub fn value_to_string(value: &Value) -> String { + fn internal_value_to_string(string: &mut String, value: &Value) { + match value { + Value::Null => (), + Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, + Value::Number(number) => { let _ = write!(string, "{}", &number); }, + Value::String(text) => string.push_str(&text), + Value::Array(array) => { + for value in array { + internal_value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + Value::Object(object) => { + for (key, value) in object { + string.push_str(key); + let _ = string.write_str(". "); + internal_value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + } + } + + let mut string = String::new(); + internal_value_to_string(&mut string, value); + string +} + +/// Transforms the JSON Value type into a Number. +pub fn value_to_number(value: &Value) -> Option { + use std::str::FromStr; + + match value { + Value::Null => None, + Value::Bool(boolean) => Some(Number::Unsigned(*boolean as u64)), + Value::Number(number) => Number::from_str(&number.to_string()).ok(), // TODO improve that + Value::String(string) => Number::from_str(string).ok(), + Value::Array(_array) => None, + Value::Object(_object) => None, + } +} + +/// Compute the hash of the given type, this is the way we produce documents ids. +pub fn compute_document_id(t: H) -> DocumentId { + let mut s = SipHasher::new(); + t.hash(&mut s); + let hash = s.finish(); + DocumentId(hash) +} + +/// Validates a string representation to be a correct document id. +pub fn validate_document_id(string: &str) -> bool { + string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') +} + +/// Extracts and validates the document id of a document. +pub fn extract_document_id(primary_key: &str, document: &IndexMap) -> Result { + match document.get(primary_key) { + Some(value) => { + let string = match value { + Value::Number(number) => number.to_string(), + Value::String(string) => string.clone(), + _ => return Err(SerializerError::InvalidDocumentIdFormat), + }; + + if validate_document_id(&string) { + Ok(compute_document_id(string)) + } else { + Err(SerializerError::InvalidDocumentIdFormat) + } + } + None => Err(SerializerError::DocumentIdNotFound), + } +} diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index 6599c3f99..124e6450a 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -3,14 +3,13 @@ mod customs_update; mod documents_addition; mod documents_deletion; mod settings_update; +mod helpers; pub use self::clear_all::{apply_clear_all, push_clear_all}; pub use self::customs_update::{apply_customs_update, push_customs_update}; -pub use self::documents_addition::{ - apply_documents_addition, apply_documents_partial_addition, DocumentsAddition, - value_to_string, compute_document_id, extract_document_id, -}; +pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; +pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id, validate_document_id}; pub use self::settings_update::{apply_settings_update, push_settings_update}; use std::cmp; @@ -23,6 +22,7 @@ use indexmap::IndexMap; use log::debug; use sdset::Set; use serde::{Deserialize, Serialize}; +use serde_json::Value; use crate::{store, DocumentId, MResult}; use crate::database::{MainT, UpdateT}; @@ -49,14 +49,14 @@ impl Update { } } - fn documents_addition(data: Vec>) -> Update { + fn documents_addition(data: Vec>) -> Update { Update { data: UpdateData::DocumentsAddition(data), enqueued_at: Utc::now(), } } - fn documents_partial(data: Vec>) -> Update { + fn documents_partial(data: Vec>) -> Update { Update { data: UpdateData::DocumentsPartial(data), enqueued_at: Utc::now(), @@ -82,8 +82,8 @@ impl Update { pub enum UpdateData { ClearAll, Customs(Vec), - DocumentsAddition(Vec>), - DocumentsPartial(Vec>), + DocumentsAddition(Vec>), + DocumentsPartial(Vec>), DocumentsDeletion(Vec), Settings(SettingsUpdate) } diff --git a/meilisearch-http/src/routes/document.rs b/meilisearch-http/src/routes/document.rs index 4ef4027dc..22586ecb6 100644 --- a/meilisearch-http/src/routes/document.rs +++ b/meilisearch-http/src/routes/document.rs @@ -6,6 +6,9 @@ use indexmap::IndexMap; use serde::Deserialize; use serde_json::Value; +use meilisearch_core::{Error, serde::SerializerError}; +use meilisearch_core::update; + use crate::error::ResponseError; use crate::helpers::Authentication; use crate::routes::{IndexParam, IndexUpdateResponse}; @@ -42,8 +45,11 @@ async fn get_document( .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - let document_id = meilisearch_core::update::compute_document_id(&path.document_id); + if !update::validate_document_id(&path.document_id) { + return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) + } + let document_id = update::compute_document_id(&path.document_id); let reader = data.db.main_read_txn()?; let response: Document = index @@ -65,7 +71,12 @@ async fn delete_document( .db .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - let document_id = meilisearch_core::update::compute_document_id(&path.document_id); + + if !update::validate_document_id(&path.document_id) { + return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) + } + + let document_id = update::compute_document_id(&path.document_id); let mut update_writer = data.db.update_write_txn()?; @@ -237,8 +248,11 @@ async fn delete_documents( let mut documents_deletion = index.documents_deletion(); for document_id in body.into_inner() { - let document_id_string = meilisearch_core::update::value_to_string(&document_id); - let document_id = meilisearch_core::update::compute_document_id(document_id_string); + let document_id_string = update::value_to_string(&document_id); + if !update::validate_document_id(&document_id_string) { + return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) + } + let document_id = update::compute_document_id(document_id_string); documents_deletion.delete_document_by_id(document_id); } From d300d788c7e7cfbdad7f05b13064032af3ed4394 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 15:18:23 +0200 Subject: [PATCH 07/10] Make the compute_document_id validate the id --- meilisearch-core/src/update/helpers.rs | 28 ++++++++++--------------- meilisearch-core/src/update/mod.rs | 2 +- meilisearch-http/src/routes/document.rs | 23 +++++--------------- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs index a3be7bb22..a0a0ee266 100644 --- a/meilisearch-core/src/update/helpers.rs +++ b/meilisearch-core/src/update/helpers.rs @@ -88,17 +88,16 @@ pub fn value_to_number(value: &Value) -> Option { } } -/// Compute the hash of the given type, this is the way we produce documents ids. -pub fn compute_document_id(t: H) -> DocumentId { - let mut s = SipHasher::new(); - t.hash(&mut s); - let hash = s.finish(); - DocumentId(hash) -} - -/// Validates a string representation to be a correct document id. -pub fn validate_document_id(string: &str) -> bool { - string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') +/// Validates a string representation to be a correct document id and +/// returns the hash of the given type, this is the way we produce documents ids. +pub fn compute_document_id(string: &str) -> Result { + if string.chars().all(|x| x.is_ascii_alphanumeric() || x == '-' || x == '_') { + let mut s = SipHasher::new(); + string.hash(&mut s); + Ok(DocumentId(s.finish())) + } else { + Err(SerializerError::InvalidDocumentIdFormat) + } } /// Extracts and validates the document id of a document. @@ -110,12 +109,7 @@ pub fn extract_document_id(primary_key: &str, document: &IndexMap Value::String(string) => string.clone(), _ => return Err(SerializerError::InvalidDocumentIdFormat), }; - - if validate_document_id(&string) { - Ok(compute_document_id(string)) - } else { - Err(SerializerError::InvalidDocumentIdFormat) - } + compute_document_id(&string) } None => Err(SerializerError::DocumentIdNotFound), } diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index 124e6450a..3a3499c3e 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -9,7 +9,7 @@ pub use self::clear_all::{apply_clear_all, push_clear_all}; pub use self::customs_update::{apply_customs_update, push_customs_update}; pub use self::documents_addition::{apply_documents_addition, apply_documents_partial_addition, DocumentsAddition}; pub use self::documents_deletion::{apply_documents_deletion, DocumentsDeletion}; -pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id, validate_document_id}; +pub use self::helpers::{index_value, value_to_string, value_to_number, compute_document_id, extract_document_id}; pub use self::settings_update::{apply_settings_update, push_settings_update}; use std::cmp; diff --git a/meilisearch-http/src/routes/document.rs b/meilisearch-http/src/routes/document.rs index 22586ecb6..3b71e656a 100644 --- a/meilisearch-http/src/routes/document.rs +++ b/meilisearch-http/src/routes/document.rs @@ -3,12 +3,10 @@ use std::collections::{BTreeSet, HashSet}; use actix_web::{web, HttpResponse}; use actix_web_macros::{delete, get, post, put}; use indexmap::IndexMap; +use meilisearch_core::{update, Error}; use serde::Deserialize; use serde_json::Value; -use meilisearch_core::{Error, serde::SerializerError}; -use meilisearch_core::update; - use crate::error::ResponseError; use crate::helpers::Authentication; use crate::routes::{IndexParam, IndexUpdateResponse}; @@ -45,11 +43,7 @@ async fn get_document( .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - if !update::validate_document_id(&path.document_id) { - return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) - } - - let document_id = update::compute_document_id(&path.document_id); + let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?; let reader = data.db.main_read_txn()?; let response: Document = index @@ -72,11 +66,7 @@ async fn delete_document( .open_index(&path.index_uid) .ok_or(ResponseError::index_not_found(&path.index_uid))?; - if !update::validate_document_id(&path.document_id) { - return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) - } - - let document_id = update::compute_document_id(&path.document_id); + let document_id = update::compute_document_id(&path.document_id).map_err(Error::Serializer)?; let mut update_writer = data.db.update_write_txn()?; @@ -248,11 +238,8 @@ async fn delete_documents( let mut documents_deletion = index.documents_deletion(); for document_id in body.into_inner() { - let document_id_string = update::value_to_string(&document_id); - if !update::validate_document_id(&document_id_string) { - return Err(Error::Serializer(SerializerError::InvalidDocumentIdFormat).into()) - } - let document_id = update::compute_document_id(document_id_string); + let document_id = update::value_to_string(&document_id); + let document_id = update::compute_document_id(&document_id).map_err(Error::Serializer)?; documents_deletion.delete_document_by_id(document_id); } From 3026840530d9865da901ec31a68341fc81521758 Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Mon, 18 May 2020 15:29:58 +0200 Subject: [PATCH 08/10] Introduce an index_document helper function --- .../src/update/documents_addition.rs | 100 +++++++++++------- 1 file changed, 60 insertions(+), 40 deletions(-) diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index 7f3ae178b..3b1ac6810 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -2,6 +2,8 @@ use std::collections::HashMap; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; +use meilisearch_schema::{Schema, FieldId}; +use meilisearch_types::DocumentId; use sdset::{duo::Union, SetOperation}; use serde::Deserialize; use serde_json::Value; @@ -11,6 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::Deserializer; +use crate::store::{DocumentsFields, DocumentsFieldsCounts}; use crate::store; use crate::update::helpers::{index_value, value_to_number, extract_document_id}; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; @@ -106,6 +109,41 @@ pub fn push_documents_addition( Ok(last_update_id) } +fn index_document( + writer: &mut heed::RwTxn, + documents_fields: DocumentsFields, + documents_fields_counts: DocumentsFieldsCounts, + ranked_map: &mut RankedMap, + indexer: &mut RawIndexer, + schema: &Schema, + field_id: FieldId, + document_id: DocumentId, + value: &Value, +) -> MResult<()> +{ + let serialized = serde_json::to_vec(value)?; + documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; + + if let Some(indexed_pos) = schema.is_indexed(field_id) { + let number_of_words = index_value(indexer, document_id, *indexed_pos, value); + if let Some(number_of_words) = number_of_words { + documents_fields_counts.put_document_field_count( + writer, + document_id, + *indexed_pos, + number_of_words as u16, + )?; + } + } + + if schema.is_ranked(field_id) { + let number = value_to_number(value).unwrap_or_default(); + ranked_map.insert(document_id, field_id, number); + } + + Ok(()) +} + pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, @@ -171,30 +209,20 @@ pub fn apply_addition<'a, 'b>( // For each document in this update for (document_id, document) in documents_additions { - // For each key-value pair in the document. for (attribute, value) in document { - let field_id = schema.insert_and_index(&attribute)?; - let serialized = serde_json::to_vec(&value)?; - index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; - - if let Some(indexed_pos) = schema.is_indexed(field_id) { - let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); - if let Some(number_of_words) = number_of_words { - index.documents_fields_counts.put_document_field_count( - writer, - document_id, - *indexed_pos, - number_of_words as u16, - )?; - } - } - - if schema.is_ranked(field_id) { - let number = value_to_number(&value).unwrap_or_default(); - ranked_map.insert(document_id, field_id, number); - } + index_document( + writer, + index.documents_fields, + index.documents_fields_counts, + &mut ranked_map, + &mut indexer, + &schema, + field_id, + document_id, + &value, + )?; } } @@ -273,25 +301,17 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind // For each key-value pair in the document. for ((document_id, field_id), value) in ram_store.drain() { - let serialized = serde_json::to_vec(&value)?; - index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; - - if let Some(indexed_pos) = schema.is_indexed(field_id) { - let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); - if let Some(number_of_words) = number_of_words { - index.documents_fields_counts.put_document_field_count( - writer, - document_id, - *indexed_pos, - number_of_words as u16, - )?; - } - } - - if schema.is_ranked(field_id) { - let number = value_to_number(&value).unwrap_or_default(); - ranked_map.insert(document_id, field_id, number); - } + index_document( + writer, + index.documents_fields, + index.documents_fields_counts, + &mut ranked_map, + &mut indexer, + &schema, + field_id, + document_id, + &value, + )?; } } From ae30ee2ade9177ef01bbe1d383b0b0f6dec70f7c Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 19 May 2020 14:11:48 +0200 Subject: [PATCH 09/10] Clean up some comments and variable names --- meilisearch-core/src/lib.rs | 4 ++-- .../src/update/documents_addition.rs | 21 ++++++++----------- meilisearch-core/src/update/mod.rs | 8 +++---- 3 files changed, 15 insertions(+), 18 deletions(-) diff --git a/meilisearch-core/src/lib.rs b/meilisearch-core/src/lib.rs index 9cec35976..38a1a3c34 100644 --- a/meilisearch-core/src/lib.rs +++ b/meilisearch-core/src/lib.rs @@ -18,13 +18,13 @@ mod query_words_mapper; mod ranked_map; mod raw_document; mod reordered_attrs; -pub mod update; pub mod criterion; pub mod facets; pub mod raw_indexer; -pub mod settings; pub mod serde; +pub mod settings; pub mod store; +pub mod update; pub use self::database::{BoxUpdateFn, Database, DatabaseOptions, MainT, UpdateT}; pub use self::error::{Error, HeedError, FstError, MResult, pest_error, FacetError}; diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index 3b1ac6810..d9d1af328 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -13,8 +13,7 @@ use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::Deserializer; -use crate::store::{DocumentsFields, DocumentsFieldsCounts}; -use crate::store; +use crate::store::{self, DocumentsFields, DocumentsFieldsCounts}; use crate::update::helpers::{index_value, value_to_number, extract_document_id}; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, MResult, RankedMap}; @@ -147,7 +146,7 @@ fn index_document( pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + new_documents: Vec>, partial: bool ) -> MResult<()> { let mut documents_additions = HashMap::new(); @@ -160,7 +159,7 @@ pub fn apply_addition<'a, 'b>( let primary_key = schema.primary_key().ok_or(Error::MissingPrimaryKey)?; // 1. store documents ids for future deletion - for mut document in addition { + for mut document in new_documents { let document_id = extract_document_id(&primary_key, &document)?; if partial { @@ -172,10 +171,8 @@ pub fn apply_addition<'a, 'b>( fields: None, }; - // retrieve the old document and - // update the new one with missing keys found in the old one - let result = Option::>::deserialize(&mut deserializer)?; - if let Some(old_document) = result { + let old_document = Option::>::deserialize(&mut deserializer)?; + if let Some(old_document) = old_document { for (key, value) in old_document { document.entry(key).or_insert(value); } @@ -242,17 +239,17 @@ pub fn apply_addition<'a, 'b>( pub fn apply_documents_partial_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + new_documents: Vec>, ) -> MResult<()> { - apply_addition(writer, index, addition, true) + apply_addition(writer, index, new_documents, true) } pub fn apply_documents_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, - addition: Vec>, + new_documents: Vec>, ) -> MResult<()> { - apply_addition(writer, index, addition, false) + apply_addition(writer, index, new_documents, false) } pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Index) -> MResult<()> { diff --git a/meilisearch-core/src/update/mod.rs b/meilisearch-core/src/update/mod.rs index 3a3499c3e..5b7c33c9d 100644 --- a/meilisearch-core/src/update/mod.rs +++ b/meilisearch-core/src/update/mod.rs @@ -49,16 +49,16 @@ impl Update { } } - fn documents_addition(data: Vec>) -> Update { + fn documents_addition(documents: Vec>) -> Update { Update { - data: UpdateData::DocumentsAddition(data), + data: UpdateData::DocumentsAddition(documents), enqueued_at: Utc::now(), } } - fn documents_partial(data: Vec>) -> Update { + fn documents_partial(documents: Vec>) -> Update { Update { - data: UpdateData::DocumentsPartial(data), + data: UpdateData::DocumentsPartial(documents), enqueued_at: Utc::now(), } } From e2b115f3a96a74f86f83907f3d7ec86bab7f133b Mon Sep 17 00:00:00 2001 From: Kerollmops Date: Tue, 19 May 2020 14:16:29 +0200 Subject: [PATCH 10/10] Improve Number extraction/conversion function --- meilisearch-core/src/update/helpers.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/meilisearch-core/src/update/helpers.rs b/meilisearch-core/src/update/helpers.rs index a0a0ee266..d17bea3b2 100644 --- a/meilisearch-core/src/update/helpers.rs +++ b/meilisearch-core/src/update/helpers.rs @@ -4,6 +4,7 @@ use std::hash::{Hash, Hasher}; use indexmap::IndexMap; use meilisearch_schema::IndexedPos; use meilisearch_types::DocumentId; +use ordered_float::OrderedFloat; use serde_json::Value; use siphasher::sip::SipHasher; @@ -81,7 +82,14 @@ pub fn value_to_number(value: &Value) -> Option { match value { Value::Null => None, Value::Bool(boolean) => Some(Number::Unsigned(*boolean as u64)), - Value::Number(number) => Number::from_str(&number.to_string()).ok(), // TODO improve that + Value::Number(number) => { + match (number.as_i64(), number.as_u64(), number.as_f64()) { + (Some(n), _, _) => Some(Number::Signed(n)), + (_, Some(n), _) => Some(Number::Unsigned(n)), + (_, _, Some(n)) => Some(Number::Float(OrderedFloat(n))), + (None, None, None) => None, + } + }, Value::String(string) => Number::from_str(string).ok(), Value::Array(_array) => None, Value::Object(_object) => None,