diff --git a/meilisearch-core/Cargo.toml b/meilisearch-core/Cargo.toml index 1706b9af6..d9b88f89b 100644 --- a/meilisearch-core/Cargo.toml +++ b/meilisearch-core/Cargo.toml @@ -34,7 +34,7 @@ pest_derive = "2.0" regex = "1.3.6" sdset = "0.4.0" serde = { version = "1.0.105", features = ["derive"] } -serde_json = "1.0.50" +serde_json = { version = "1.0.50", features = ["preserve_order"] } siphasher = "0.3.2" slice-group-by = "0.2.6" unicase = "2.6.0" diff --git a/meilisearch-core/src/serde/indexer.rs b/meilisearch-core/src/serde/indexer.rs deleted file mode 100644 index c8b6abeaf..000000000 --- a/meilisearch-core/src/serde/indexer.rs +++ /dev/null @@ -1,362 +0,0 @@ -use meilisearch_schema::IndexedPos; -use serde::ser; -use serde::Serialize; - -use super::{ConvertToString, SerializerError}; -use crate::raw_indexer::RawIndexer; -use crate::DocumentId; - -pub struct Indexer<'a> { - pub pos: IndexedPos, - pub indexer: &'a mut RawIndexer, - pub document_id: DocumentId, -} - -impl<'a> ser::Serializer for Indexer<'a> { - type Ok = Option; - type Error = SerializerError; - type SerializeSeq = SeqIndexer<'a>; - type SerializeTuple = TupleIndexer<'a>; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapIndexer<'a>; - type SerializeStruct = StructIndexer<'a>; - type SerializeStructVariant = ser::Impossible; - - fn serialize_bool(self, _value: bool) -> Result { - Ok(None) - } - - fn serialize_char(self, value: char) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i8(self, value: i8) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i16(self, value: i16) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i32(self, value: i32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_i64(self, value: i64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u8(self, value: u8) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u16(self, value: u16) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u32(self, value: u32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_u64(self, value: u64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_f32(self, value: f32) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_f64(self, value: f64) -> Result { - let text = value.serialize(ConvertToString)?; - self.serialize_str(&text) - } - - fn serialize_str(self, text: &str) -> Result { - let number_of_words = self - .indexer - .index_text(self.document_id, self.pos, text); - Ok(Some(number_of_words)) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnindexableType { type_name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Ok(None) - } - - fn serialize_some(self, value: &T) -> Result - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - let number_of_words = self - .indexer - .index_text(self.document_id, self.pos, &text); - Ok(Some(number_of_words)) - } - - fn serialize_unit(self) -> Result { - Ok(None) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Ok(None) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - ) -> Result { - Ok(None) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T, - ) -> Result - where - T: ser::Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T, - ) -> Result - where - T: ser::Serialize, - { - Err(SerializerError::UnindexableType { - type_name: "newtype variant", - }) - } - - fn serialize_seq(self, _len: Option) -> Result { - let indexer = SeqIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_tuple(self, _len: usize) -> Result { - let indexer = TupleIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "tuple struct", - }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "tuple variant", - }) - } - - fn serialize_map(self, _len: Option) -> Result { - let indexer = MapIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize, - ) -> Result { - let indexer = StructIndexer { - pos: self.pos, - document_id: self.document_id, - indexer: self.indexer, - texts: Vec::new(), - }; - - Ok(indexer) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize, - ) -> Result { - Err(SerializerError::UnindexableType { - type_name: "struct variant", - }) - } -} - -pub struct SeqIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeSeq for SeqIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct MapIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeMap for MapIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = key.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct StructIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeStruct for StructIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T, - ) -> Result<(), Self::Error> - where - T: ser::Serialize, - { - let key_text = key.to_owned(); - let value_text = value.serialize(ConvertToString)?; - self.texts.push(key_text); - self.texts.push(value_text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} - -pub struct TupleIndexer<'a> { - pos: IndexedPos, - document_id: DocumentId, - indexer: &'a mut RawIndexer, - texts: Vec, -} - -impl<'a> ser::SerializeTuple for TupleIndexer<'a> { - type Ok = Option; - type Error = SerializerError; - - fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> - where - T: Serialize, - { - let text = value.serialize(ConvertToString)?; - self.texts.push(text); - Ok(()) - } - - fn end(self) -> Result { - let texts = self.texts.iter().map(String::as_str); - self.indexer - .index_text_seq(self.document_id, self.pos, texts); - Ok(None) - } -} diff --git a/meilisearch-core/src/serde/mod.rs b/meilisearch-core/src/serde/mod.rs index 9cb8e50bc..6a1e51a09 100644 --- a/meilisearch-core/src/serde/mod.rs +++ b/meilisearch-core/src/serde/mod.rs @@ -12,13 +12,11 @@ mod convert_to_number; mod convert_to_string; mod deserializer; mod extract_document_id; -mod indexer; pub use self::convert_to_number::ConvertToNumber; pub use self::convert_to_string::ConvertToString; pub use self::deserializer::{Deserializer, DeserializerError}; pub use self::extract_document_id::{compute_document_id, extract_document_id, value_to_string}; -pub use self::indexer::Indexer; use std::{error::Error, fmt}; diff --git a/meilisearch-core/src/update/documents_addition.rs b/meilisearch-core/src/update/documents_addition.rs index c65cf6e81..b7e6395af 100644 --- a/meilisearch-core/src/update/documents_addition.rs +++ b/meilisearch-core/src/update/documents_addition.rs @@ -1,4 +1,6 @@ use std::collections::HashMap; +use std::fmt::Write as _; +use std::fmt; use fst::{set::OpBuilder, SetBuilder}; use indexmap::IndexMap; @@ -6,12 +8,15 @@ use sdset::{duo::Union, SetOperation}; use serde::{Deserialize, Serialize}; use serde_json::Value; +use meilisearch_types::DocumentId; +use meilisearch_schema::IndexedPos; + use crate::database::{MainT, UpdateT}; use crate::database::{UpdateEvent, UpdateEventsEmitter}; use crate::facets; use crate::raw_indexer::RawIndexer; use crate::serde::{extract_document_id, Deserializer}; -use crate::serde::{ConvertToNumber, Indexer}; +use crate::serde::ConvertToNumber; use crate::store; use crate::update::{apply_documents_deletion, compute_short_prefixes, next_update_id, Update}; use crate::{Error, MResult, RankedMap}; @@ -106,6 +111,69 @@ pub fn push_documents_addition( Ok(last_update_id) } +// TODO move this helper functions elsewhere +/// Returns the number of words indexed or `None` if the type +fn index_value( + indexer: &mut RawIndexer, + document_id: DocumentId, + indexed_pos: IndexedPos, + value: &Value, +) -> Option +{ + fn value_to_string(string: &mut String, value: &Value) { + match value { + Value::Null => (), + Value::Bool(boolean) => { let _ = write!(string, "{}", &boolean); }, + Value::Number(number) => { let _ = write!(string, "{}", &number); }, + Value::String(text) => string.push_str(&text), + Value::Array(array) => { + for value in array { + value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + Value::Object(object) => { + for (key, value) in object { + string.push_str(key); + let _ = string.write_str(". "); + value_to_string(string, value); + let _ = string.write_str(". "); + } + }, + } + } + + match value { + Value::Null => None, + Value::Bool(boolean) => { + let text = boolean.to_string(); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::Number(number) => { + let text = number.to_string(); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::String(string) => { + let number_of_words = indexer.index_text(document_id, indexed_pos, &string); + Some(number_of_words) + }, + Value::Array(_) => { + let mut text = String::new(); + value_to_string(&mut text, value); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + Value::Object(_) => { + let mut text = String::new(); + value_to_string(&mut text, value); + let number_of_words = indexer.index_text(document_id, indexed_pos, &text); + Some(number_of_words) + }, + } +} + pub fn apply_addition<'a, 'b>( writer: &'a mut heed::RwTxn<'b, MainT>, index: &store::Index, @@ -183,8 +251,8 @@ pub fn apply_addition<'a, 'b>( index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; if let Some(indexed_pos) = schema.is_indexed(field_id) { - let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; - if let Some(number_of_words) = value.serialize(indexer)? { + let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); + if let Some(number_of_words) = number_of_words { index.documents_fields_counts.put_document_field_count( writer, document_id, @@ -280,8 +348,8 @@ pub fn reindex_all_documents(writer: &mut heed::RwTxn, index: &store::Ind index.documents_fields.put_document_field(writer, document_id, field_id, &serialized)?; if let Some(indexed_pos) = schema.is_indexed(field_id) { - let indexer = Indexer { pos: *indexed_pos, indexer: &mut indexer, document_id }; - if let Some(number_of_words) = value.serialize(indexer)? { + let number_of_words = index_value(&mut indexer, document_id, *indexed_pos, &value); + if let Some(number_of_words) = number_of_words { index.documents_fields_counts.put_document_field_count( writer, document_id,