From a5bfbf244c9009acb338d43e1fea8ac723d8cc0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Thu, 3 Oct 2019 11:49:13 +0200 Subject: [PATCH] Introduce the documents Deserializer --- Cargo.toml | 15 ++ src/lib.rs | 10 +- src/main.rs | 2 +- src/number.rs | 55 +++++ src/query_builder.rs | 2 +- src/ranked_map.rs | 35 ++++ src/serde/convert_to_number.rs | 180 +++++++++++++++++ src/serde/convert_to_string.rs | 176 ++++++++++++++++ src/serde/deserializer.rs | 142 +++++++++++++ src/serde/extract_document_id.rs | 273 +++++++++++++++++++++++++ src/serde/indexer.rs | 336 +++++++++++++++++++++++++++++++ src/serde/mod.rs | 130 ++++++++++++ src/serde/serializer.rs | 286 ++++++++++++++++++++++++++ src/store/documents_fields.rs | 64 ++++++ src/store/mod.rs | 68 ++++++- 15 files changed, 1760 insertions(+), 14 deletions(-) create mode 100644 src/number.rs create mode 100644 src/ranked_map.rs create mode 100644 src/serde/convert_to_number.rs create mode 100644 src/serde/convert_to_string.rs create mode 100644 src/serde/deserializer.rs create mode 100644 src/serde/extract_document_id.rs create mode 100644 src/serde/indexer.rs create mode 100644 src/serde/mod.rs create mode 100644 src/serde/serializer.rs create mode 100644 src/store/documents_fields.rs diff --git a/Cargo.toml b/Cargo.toml index c04eb5170..7ebbd6a43 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -5,17 +5,32 @@ authors = ["Clément Renault "] edition = "2018" [dependencies] +bincode = "1.1.4" byteorder = "1.3.2" deunicode = "1.0.0" +hashbrown = { version = "0.6.0", features = ["serde"] } once_cell = "1.2.0" +ordered-float = { version = "1.0.2", features = ["serde"] } rkv = "0.10.2" sdset = "0.3.2" +serde = { version = "1.0.99", features = ["derive"] } +serde_json = "1.0.40" +siphasher = "0.3.0" slice-group-by = "0.2.6" zerocopy = "0.2.8" meilidb-schema = { path = "../MeiliDB/meilidb-schema", version = "0.1.0" } meilidb-tokenizer = { path = "../MeiliDB/meilidb-tokenizer", version = "0.1.0" } 
+[dependencies.rmp-serde] +git = "https://github.com/3Hren/msgpack-rust.git" +rev = "40b3d48" + +[dependencies.rmpv] +git = "https://github.com/3Hren/msgpack-rust.git" +rev = "40b3d48" +features = ["with-serde"] + [dependencies.levenshtein_automata] git = "https://github.com/Kerollmops/levenshtein-automata.git" branch = "arc-byte-slice" diff --git a/src/lib.rs b/src/lib.rs index a39159ce4..f9867e5d9 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,15 +1,22 @@ mod automaton; +mod number; mod query_builder; mod raw_document; mod reordered_attrs; +mod ranked_map; pub mod criterion; pub mod raw_indexer; +pub mod serde; pub mod store; pub use self::query_builder::QueryBuilder; pub use self::raw_document::RawDocument; +use self::number::{Number, ParseNumberError}; +use self::ranked_map::RankedMap; + use zerocopy::{AsBytes, FromBytes}; +use ::serde::{Serialize, Deserialize}; pub type BEI64 = zerocopy::I64; @@ -18,9 +25,10 @@ pub type BEI64 = zerocopy::I64; /// It is used to inform the database the document you want to deserialize. /// Helpful for custom ranking. #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] +#[derive(Serialize, Deserialize)] #[derive(AsBytes, FromBytes)] #[repr(C)] -pub struct DocumentId(pub i64); +pub struct DocumentId(pub u64); /// This structure represent the position of a word /// in a document and its attributes. 
diff --git a/src/main.rs b/src/main.rs index 65cfce3ca..4d8d8ed63 100644 --- a/src/main.rs +++ b/src/main.rs @@ -16,7 +16,7 @@ fn main() { let created_arc = Manager::singleton().write().unwrap().get_or_create(path, Rkv::new).unwrap(); let env = created_arc.read().unwrap(); - let (words, synonyms) = store::create(&env, "test").unwrap(); + let (words, synonyms, documents_fields) = store::create(&env, "test").unwrap(); { let mut writer = env.write().unwrap(); diff --git a/src/number.rs b/src/number.rs new file mode 100644 index 000000000..5e64cc78f --- /dev/null +++ b/src/number.rs @@ -0,0 +1,55 @@ +use std::num::{ParseIntError, ParseFloatError}; +use std::str::FromStr; +use std::fmt; + +use ordered_float::OrderedFloat; +use serde::{Serialize, Deserialize}; + +#[derive(Serialize, Deserialize)] +#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub enum Number { + Unsigned(u64), + Signed(i64), + Float(OrderedFloat), +} + +impl FromStr for Number { + type Err = ParseNumberError; + + fn from_str(s: &str) -> Result { + let uint_error = match u64::from_str(s) { + Ok(unsigned) => return Ok(Number::Unsigned(unsigned)), + Err(error) => error, + }; + + let int_error = match i64::from_str(s) { + Ok(signed) => return Ok(Number::Signed(signed)), + Err(error) => error, + }; + + let float_error = match f64::from_str(s) { + Ok(float) => return Ok(Number::Float(OrderedFloat(float))), + Err(error) => error, + }; + + Err(ParseNumberError { uint_error, int_error, float_error }) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParseNumberError { + uint_error: ParseIntError, + int_error: ParseIntError, + float_error: ParseFloatError, +} + +impl fmt::Display for ParseNumberError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + if self.uint_error == self.int_error { + write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error) + } else { + write!(f, "can not parse number: {}, {}, {}", + self.uint_error, self.int_error, 
self.float_error) + } + } +} diff --git a/src/query_builder.rs b/src/query_builder.rs index 9cb91b755..0fb146825 100644 --- a/src/query_builder.rs +++ b/src/query_builder.rs @@ -1,6 +1,6 @@ use std::time::{Instant, Duration}; use std::ops::Range; -use std::{cmp, mem}; +use std::mem; use fst::{IntoStreamer, Streamer}; use sdset::SetBuf; diff --git a/src/ranked_map.rs b/src/ranked_map.rs new file mode 100644 index 000000000..0168883ff --- /dev/null +++ b/src/ranked_map.rs @@ -0,0 +1,35 @@ +use std::io::{Read, Write}; + +use hashbrown::HashMap; +use meilidb_schema::SchemaAttr; + +use crate::{DocumentId, Number}; + +#[derive(Debug, Default, Clone, PartialEq, Eq)] +pub struct RankedMap(HashMap<(DocumentId, SchemaAttr), Number>); + +impl RankedMap { + pub fn len(&self) -> usize { + self.0.len() + } + + pub fn insert(&mut self, document: DocumentId, attribute: SchemaAttr, number: Number) { + self.0.insert((document, attribute), number); + } + + pub fn remove(&mut self, document: DocumentId, attribute: SchemaAttr) { + self.0.remove(&(document, attribute)); + } + + pub fn get(&self, document: DocumentId, attribute: SchemaAttr) -> Option { + self.0.get(&(document, attribute)).cloned() + } + + pub fn read_from_bin(reader: R) -> bincode::Result { + bincode::deserialize_from(reader).map(RankedMap) + } + + pub fn write_to_bin(&self, writer: W) -> bincode::Result<()> { + bincode::serialize_into(writer, &self.0) + } +} diff --git a/src/serde/convert_to_number.rs b/src/serde/convert_to_number.rs new file mode 100644 index 000000000..aec22730a --- /dev/null +++ b/src/serde/convert_to_number.rs @@ -0,0 +1,180 @@ +use std::str::FromStr; + +use ordered_float::OrderedFloat; +use serde::ser; +use serde::Serialize; + +use super::SerializerError; +use crate::Number; + +pub struct ConvertToNumber; + +impl ser::Serializer for ConvertToNumber { + type Ok = Number; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type 
SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + fn serialize_bool(self, value: bool) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_char(self, _value: char) -> Result { + Err(SerializerError::UnrankableType { type_name: "char" }) + } + + fn serialize_i8(self, value: i8) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i16(self, value: i16) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i32(self, value: i32) -> Result { + Ok(Number::Signed(i64::from(value))) + } + + fn serialize_i64(self, value: i64) -> Result { + Ok(Number::Signed(value)) + } + + fn serialize_u8(self, value: u8) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u16(self, value: u16) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u32(self, value: u32) -> Result { + Ok(Number::Unsigned(u64::from(value))) + } + + fn serialize_u64(self, value: u64) -> Result { + Ok(Number::Unsigned(value)) + } + + fn serialize_f32(self, value: f32) -> Result { + Ok(Number::Float(OrderedFloat(f64::from(value)))) + } + + fn serialize_f64(self, value: f64) -> Result { + Ok(Number::Float(OrderedFloat(value))) + } + + fn serialize_str(self, value: &str) -> Result { + Ok(Number::from_str(value)?) 
+ } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnrankableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnrankableType { type_name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnrankableType { type_name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnrankableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnrankableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnrankableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnrankableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnrankableType { type_name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnrankableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnrankableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnrankableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnrankableType { type_name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static 
str, + _len: usize + ) -> Result + { + Err(SerializerError::UnrankableType { type_name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnrankableType { type_name: "struct variant" }) + } +} diff --git a/src/serde/convert_to_string.rs b/src/serde/convert_to_string.rs new file mode 100644 index 000000000..cd109f534 --- /dev/null +++ b/src/serde/convert_to_string.rs @@ -0,0 +1,176 @@ +use serde::Serialize; +use serde::ser; + +use super::SerializerError; + +pub struct ConvertToString; + +impl ser::Serializer for ConvertToString { + type Ok = String; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + fn serialize_bool(self, _value: bool) -> Result { + Err(SerializerError::UnserializableType { type_name: "boolean" }) + } + + fn serialize_char(self, value: char) -> Result { + Ok(value.to_string()) + } + + fn serialize_i8(self, value: i8) -> Result { + Ok(value.to_string()) + } + + fn serialize_i16(self, value: i16) -> Result { + Ok(value.to_string()) + } + + fn serialize_i32(self, value: i32) -> Result { + Ok(value.to_string()) + } + + fn serialize_i64(self, value: i64) -> Result { + Ok(value.to_string()) + } + + fn serialize_u8(self, value: u8) -> Result { + Ok(value.to_string()) + } + + fn serialize_u16(self, value: u16) -> Result { + Ok(value.to_string()) + } + + fn serialize_u32(self, value: u32) -> Result { + Ok(value.to_string()) + } + + fn serialize_u64(self, value: u64) -> Result { + Ok(value.to_string()) + } + + fn serialize_f32(self, value: f32) -> Result { + Ok(value.to_string()) + } + + fn serialize_f64(self, value: f64) -> Result { + 
Ok(value.to_string()) + } + + fn serialize_str(self, value: &str) -> Result { + Ok(value.to_string()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + } + + fn 
serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct variant" }) + } +} diff --git a/src/serde/deserializer.rs b/src/serde/deserializer.rs new file mode 100644 index 000000000..dda13892c --- /dev/null +++ b/src/serde/deserializer.rs @@ -0,0 +1,142 @@ +use std::collections::HashSet; +use std::io::Cursor; +use std::{fmt, error::Error}; + +use meilidb_schema::{Schema, SchemaAttr}; +use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; +use rmp_serde::decode::{Error as RmpError}; +use serde::{de, forward_to_deserialize_any}; + +use crate::store::DocumentsFields; +use crate::DocumentId; + +#[derive(Debug)] +pub enum DeserializerError { + RmpError(RmpError), + RkvError(rkv::StoreError), + Custom(String), +} + +impl de::Error for DeserializerError { + fn custom(msg: T) -> Self { + DeserializerError::Custom(msg.to_string()) + } +} + +impl fmt::Display for DeserializerError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + DeserializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e), + DeserializerError::RkvError(e) => write!(f, "rkv related error: {}", e), + DeserializerError::Custom(s) => f.write_str(s), + } + } +} + +impl Error for DeserializerError {} + +impl From for DeserializerError { + fn from(error: RmpError) -> DeserializerError { + DeserializerError::RmpError(error) + } +} + +impl From for DeserializerError { + fn from(error: rkv::StoreError) -> DeserializerError { + DeserializerError::RkvError(error) + } +} + +pub struct Deserializer<'a, R> { + pub document_id: DocumentId, + pub 
reader: &'a R, + pub documents_fields: DocumentsFields, + pub schema: &'a Schema, + pub fields: Option<&'a HashSet>, +} + +impl<'de, 'a, 'b, R: 'a> de::Deserializer<'de> for &'b mut Deserializer<'a, R> +where R: rkv::Readable, +{ + type Error = DeserializerError; + + fn deserialize_any(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.deserialize_map(visitor) + } + + forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct struct enum identifier ignored_any + } + + fn deserialize_map(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + let mut error = None; + + let iter = self.documents_fields + .document_fields(self.reader, self.document_id)? + .filter_map(|result| { + let (attr, value) = match result { + Ok(value) => value, + Err(e) => { error = Some(e); return None }, + }; + + let is_displayed = self.schema.props(attr).is_displayed(); + if is_displayed && self.fields.map_or(true, |f| f.contains(&attr)) { + let attribute_name = self.schema.attribute_name(attr); + Some((attribute_name, Value::new(value))) + } else { + None + } + }); + + let map_deserializer = de::value::MapDeserializer::new(iter); + let result = visitor.visit_map(map_deserializer).map_err(DeserializerError::from); + + match error.take() { + Some(error) => Err(error.into()), + None => result, + } + } +} + +struct Value(RmpDeserializer>>) where A: AsRef<[u8]>; + +impl Value where A: AsRef<[u8]> +{ + fn new(value: A) -> Value { + Value(RmpDeserializer::new(Cursor::new(value))) + } +} + +impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value +where A: AsRef<[u8]>, +{ + type Deserializer = Self; + + fn into_deserializer(self) -> Self::Deserializer { + self + } +} + +impl<'de, 'a, A> de::Deserializer<'de> for Value +where A: AsRef<[u8]>, +{ + type Error = RmpError; + + fn deserialize_any(mut self, visitor: V) -> Result + where V: de::Visitor<'de> + { 
+ self.0.deserialize_any(visitor) + } + + forward_to_deserialize_any! { + bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string + bytes byte_buf option unit unit_struct newtype_struct seq tuple + tuple_struct map struct enum identifier ignored_any + } +} diff --git a/src/serde/extract_document_id.rs b/src/serde/extract_document_id.rs new file mode 100644 index 000000000..da90101e2 --- /dev/null +++ b/src/serde/extract_document_id.rs @@ -0,0 +1,273 @@ +use std::hash::{Hash, Hasher}; + +use crate::DocumentId; +use serde::{ser, Serialize}; +use serde_json::Value; +use siphasher::sip::SipHasher; + +use super::{SerializerError, ConvertToString}; + +pub fn extract_document_id( + identifier: &str, + document: &D, +) -> Result, SerializerError> +where D: serde::Serialize, +{ + let serializer = ExtractDocumentId { identifier }; + document.serialize(serializer) +} + +pub fn value_to_string(value: &Value) -> Option { + match value { + Value::Null => None, + Value::Bool(_) => None, + Value::Number(value) => Some(value.to_string()), + Value::String(value) => Some(value.to_string()), + Value::Array(_) => None, + Value::Object(_) => None, + } +} + +pub fn compute_document_id(t: H) -> DocumentId { + let mut s = SipHasher::new(); + t.hash(&mut s); + let hash = s.finish(); + DocumentId(hash) +} + +struct ExtractDocumentId<'a> { + identifier: &'a str, +} + +impl<'a> ser::Serializer for ExtractDocumentId<'a> { + type Ok = Option; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ExtractDocumentIdMapSerializer<'a>; + type SerializeStruct = ExtractDocumentIdStructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! 
{ + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, _value: &str) -> Result { + Err(SerializerError::UnserializableType { type_name: "str" }) + } + + fn serialize_bytes(self, _value: &[u8]) -> Result { + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + 
Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + let serializer = ExtractDocumentIdMapSerializer { + identifier: self.identifier, + document_id: None, + current_key_name: None, + }; + + Ok(serializer) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + let serializer = ExtractDocumentIdStructSerializer { + identifier: self.identifier, + document_id: None, + }; + + Ok(serializer) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct variant" }) + } +} + +pub struct ExtractDocumentIdMapSerializer<'a> { + identifier: &'a str, + document_id: Option, + current_key_name: Option, +} + +impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { + type Ok = Option; + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = key.serialize(ConvertToString)?; + self.current_key_name = Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V + ) -> Result<(), Self::Error> + where K: Serialize, V: Serialize, + { + let key = key.serialize(ConvertToString)?; + + if self.identifier == key { + let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; + match value_to_string(&value).map(|s| compute_document_id(&s)) { + Some(document_id) => self.document_id = 
Some(document_id), + None => return Err(SerializerError::InvalidDocumentIdType), + } + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(self.document_id) + } +} + +pub struct ExtractDocumentIdStructSerializer<'a> { + identifier: &'a str, + document_id: Option, +} + +impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> { + type Ok = Option; + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T + ) -> Result<(), Self::Error> + where T: Serialize, + { + if self.identifier == key { + let value = serde_json::to_string(value).and_then(|s| serde_json::from_str(&s))?; + match value_to_string(&value).map(compute_document_id) { + Some(document_id) => self.document_id = Some(document_id), + None => return Err(SerializerError::InvalidDocumentIdType), + } + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(self.document_id) + } +} diff --git a/src/serde/indexer.rs b/src/serde/indexer.rs new file mode 100644 index 000000000..69a7ddecf --- /dev/null +++ b/src/serde/indexer.rs @@ -0,0 +1,336 @@ +use meilidb_schema::SchemaAttr; +use serde::ser; +use serde::Serialize; + +use crate::DocumentId; +use crate::raw_indexer::RawIndexer; +use super::{SerializerError, ConvertToString}; + +pub struct Indexer<'a> { + pub attribute: SchemaAttr, + pub indexer: &'a mut RawIndexer, + pub document_id: DocumentId, +} + +impl<'a> ser::Serializer for Indexer<'a> { + type Ok = (); + type Error = SerializerError; + type SerializeSeq = SeqIndexer<'a>; + type SerializeTuple = TupleIndexer<'a>; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = MapIndexer<'a>; + type SerializeStruct = StructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + fn serialize_bool(self, _value: bool) -> Result { + Err(SerializerError::UnindexableType { type_name: "boolean" }) + } + + fn serialize_char(self, value: char) -> Result { + let text = 
value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i8(self, value: i8) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i16(self, value: i16) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i32(self, value: i32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_i64(self, value: i64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u8(self, value: u8) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u16(self, value: u16) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u32(self, value: u32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_u64(self, value: u64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_f32(self, value: f32) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_f64(self, value: f64) -> Result { + let text = value.serialize(ConvertToString)?; + self.serialize_str(&text) + } + + fn serialize_str(self, text: &str) -> Result { + self.indexer.index_text(self.document_id, self.attribute, text); + Ok(()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnindexableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnindexableType { type_name: "Option" }) + } + + fn serialize_some(self, value: &T) -> Result + where T: ser::Serialize, + { + let text = value.serialize(ConvertToString)?; + self.indexer.index_text(self.document_id, self.attribute, &text); + Ok(()) + } + + fn serialize_unit(self) -> Result { + 
Err(SerializerError::UnindexableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnindexableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: ser::Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnindexableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + let indexer = SeqIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) + } + + fn serialize_tuple(self, _len: usize) -> Result { + let indexer = TupleIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + let indexer = MapIndexer { + attribute: self.attribute, + document_id: self.document_id, + indexer: self.indexer, + texts: Vec::new(), + }; + + Ok(indexer) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "struct" }) + } + + 
fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnindexableType { type_name: "struct variant" }) + } +} + +pub struct SeqIndexer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeSeq for SeqIndexer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_element(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize + { + let text = value.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn end(self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); + Ok(()) + } +} + +pub struct MapIndexer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeMap for MapIndexer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let text = key.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let text = value.serialize(ConvertToString)?; + self.texts.push(text); + Ok(()) + } + + fn end(self) -> Result { + let texts = self.texts.iter().map(String::as_str); + self.indexer.index_text_seq(self.document_id, self.attribute, texts); + Ok(()) + } +} + +pub struct StructSerializer<'a> { + attribute: SchemaAttr, + document_id: DocumentId, + indexer: &'a mut RawIndexer, + texts: Vec, +} + +impl<'a> ser::SerializeStruct for StructSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T, + ) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key_text = key.to_owned(); + let 
value_text = value.serialize(ConvertToString)?;
+        self.texts.push(key_text);
+        self.texts.push(value_text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
+
+pub struct TupleIndexer<'a> {
+    attribute: SchemaAttr,
+    document_id: DocumentId,
+    indexer: &'a mut RawIndexer,
+    texts: Vec<String>,
+}
+
+impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+    where T: Serialize
+    {
+        let text = value.serialize(ConvertToString)?;
+        self.texts.push(text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
diff --git a/src/serde/mod.rs b/src/serde/mod.rs
new file mode 100644
index 000000000..52c37e4f6
--- /dev/null
+++ b/src/serde/mod.rs
@@ -0,0 +1,139 @@
+// Generates the `ser::Serializer` methods that reject primitive values.
+//
+// Every `$ty => $se_method` pair expands to a method returning
+// `SerializerError::UnserializableType`. The type name is produced with
+// `stringify!` because metavariables are not expanded inside string
+// literals: `"$ty"` would make every error report the literal text `$ty`.
+macro_rules! forward_to_unserializable_type {
+    ($($ty:ident => $se_method:ident,)*) => {
+        $(
+            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
+                Err(SerializerError::UnserializableType { type_name: stringify!($ty) })
+            }
+        )*
+    }
+}
+
+mod convert_to_number;
+mod convert_to_string;
+mod deserializer;
+mod extract_document_id;
+mod indexer;
+mod serializer;
+
+pub use self::deserializer::{Deserializer, DeserializerError};
+pub use self::extract_document_id::{extract_document_id, compute_document_id, value_to_string};
+pub use self::convert_to_string::ConvertToString;
+pub use self::convert_to_number::ConvertToNumber;
+pub use self::indexer::Indexer;
+pub use self::serializer::Serializer;
+
+use std::collections::BTreeMap;
+use std::{fmt, error::Error};
+
+use meilidb_schema::SchemaAttr;
+use rmp_serde::encode::Error as RmpError;
+use serde_json::Error as SerdeJsonError;
+use serde::ser;
+
+use crate::{DocumentId, ParseNumberError};
+
+/// Errors that can occur while serializing, indexing or ranking a document.
+#[derive(Debug)]
+pub enum SerializerError {
+    DocumentIdNotFound,
+    InvalidDocumentIdType,
+    RmpError(RmpError),
+    RkvError(rkv::StoreError),
+    SerdeJsonError(SerdeJsonError),
+    ParseNumberError(ParseNumberError),
+    UnserializableType { type_name: &'static str },
+    UnindexableType { type_name: &'static str },
+    UnrankableType { type_name: &'static str },
+    Custom(String),
+}
+
+impl ser::Error for SerializerError {
+    fn custom<T: fmt::Display>(msg: T) -> Self {
+        SerializerError::Custom(msg.to_string())
+    }
+}
+
+impl fmt::Display for SerializerError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            SerializerError::DocumentIdNotFound => {
+                write!(f, "serialized document does not have an id according to the schema")
+            },
+            SerializerError::InvalidDocumentIdType => {
+                write!(f, "document identifier can only be of type string or number")
+            },
+            SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
+            SerializerError::RkvError(e) => write!(f, "rkv related error: {}", e),
+            SerializerError::SerdeJsonError(e) => write!(f, "serde json error: {}", e),
+            SerializerError::ParseNumberError(e) => {
+                write!(f, "error while trying to parse a number: {}", e)
+            },
+            SerializerError::UnserializableType { type_name } => {
+                write!(f, "{} are not a serializable type", type_name)
+            },
+            SerializerError::UnindexableType { type_name } => {
+                write!(f, "{} are not an indexable type", type_name)
+            },
+            SerializerError::UnrankableType { type_name } => {
+                write!(f, "{} types can not be used for ranking", type_name)
+            },
+            SerializerError::Custom(s) => f.write_str(s),
+        }
+    }
+}
+
+impl Error for SerializerError {}
+
+impl From<String> for SerializerError {
+    fn from(value: String) -> SerializerError {
+        SerializerError::Custom(value)
+    }
+}
+
+impl From<RmpError> for SerializerError {
+    fn from(error: RmpError) -> SerializerError {
+        SerializerError::RmpError(error)
+    }
+}
+
+impl From<SerdeJsonError> for SerializerError {
+    fn from(error: SerdeJsonError) -> SerializerError {
+        SerializerError::SerdeJsonError(error)
+    }
+}
+
+impl From<rkv::StoreError> for SerializerError {
+    fn from(error: rkv::StoreError) -> SerializerError {
+        SerializerError::RkvError(error)
+    }
+}
+
+impl From<ParseNumberError> for SerializerError {
+    fn from(error: ParseNumberError) -> SerializerError {
+        SerializerError::ParseNumberError(error)
+    }
+}
+
+/// In-memory map of serialized field values keyed by `(DocumentId, SchemaAttr)`.
+#[derive(Default)]
+pub struct RamDocumentStore(BTreeMap<(DocumentId, SchemaAttr), Vec<u8>>);
+
+impl RamDocumentStore {
+    pub fn new() -> RamDocumentStore {
+        RamDocumentStore(BTreeMap::new())
+    }
+
+    pub fn set_document_field(&mut self, id: DocumentId, attr: SchemaAttr, value: Vec<u8>) {
+        self.0.insert((id, attr), value);
+    }
+
+    pub fn into_inner(self) -> BTreeMap<(DocumentId, SchemaAttr), Vec<u8>> {
+        self.0
+    }
+}
diff --git a/src/serde/serializer.rs b/src/serde/serializer.rs
new file mode 100644
index 000000000..aaade76ba
--- /dev/null
+++ b/src/serde/serializer.rs
@@ -0,0 +1,286 @@
+use meilidb_schema::Schema;
+use serde::ser;
+
+use crate::{DocumentId, RankedMap};
+use crate::raw_indexer::RawIndexer;
+use super::{RamDocumentStore, SerializerError, ConvertToString,
ConvertToNumber, Indexer}; + +pub struct Serializer<'a> { + pub schema: &'a Schema, + pub document_store: &'a mut RamDocumentStore, + pub indexer: &'a mut RawIndexer, + pub ranked_map: &'a mut RankedMap, + pub document_id: DocumentId, +} + +impl<'a> ser::Serializer for Serializer<'a> { + type Ok = (); + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = MapSerializer<'a>; + type SerializeStruct = StructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, _v: &str) -> Result { + Err(SerializerError::UnserializableType { type_name: "str" }) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnserializableType { type_name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { type_name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { type_name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static 
str, + value: &T + ) -> Result + where T: ser::Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: ser::Serialize, + { + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { type_name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { type_name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Ok(MapSerializer { + schema: self.schema, + document_id: self.document_id, + document_store: self.document_store, + indexer: self.indexer, + ranked_map: self.ranked_map, + current_key_name: None, + }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Ok(StructSerializer { + schema: self.schema, + document_id: self.document_id, + document_store: self.document_store, + indexer: self.indexer, + ranked_map: self.ranked_map, + }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { type_name: "struct variant" }) + } +} + +pub struct MapSerializer<'a> { + schema: &'a Schema, + document_id: DocumentId, + document_store: &'a mut RamDocumentStore, + indexer: &'a mut RawIndexer, + ranked_map: &'a mut RankedMap, + current_key_name: Option, +} + +impl<'a> ser::SerializeMap for 
MapSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key = key.serialize(ConvertToString)?; + self.current_key_name = Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: ser::Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V, + ) -> Result<(), Self::Error> + where K: ser::Serialize, V: ser::Serialize, + { + let key = key.serialize(ConvertToString)?; + + serialize_value( + self.schema, + self.document_id, + self.document_store, + self.indexer, + self.ranked_map, + &key, + value, + ) + } + + fn end(self) -> Result { + Ok(()) + } +} + +pub struct StructSerializer<'a> { + schema: &'a Schema, + document_id: DocumentId, + document_store: &'a mut RamDocumentStore, + indexer: &'a mut RawIndexer, + ranked_map: &'a mut RankedMap, +} + +impl<'a> ser::SerializeStruct for StructSerializer<'a> { + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T, + ) -> Result<(), Self::Error> + where T: ser::Serialize, + { + serialize_value( + self.schema, + self.document_id, + self.document_store, + self.indexer, + self.ranked_map, + key, + value, + ) + } + + fn end(self) -> Result { + Ok(()) + } +} + +fn serialize_value( + schema: &Schema, + document_id: DocumentId, + document_store: &mut RamDocumentStore, + indexer: &mut RawIndexer, + ranked_map: &mut RankedMap, + key: &str, + value: &T, +) -> Result<(), SerializerError> +where T: ser::Serialize, +{ + if let Some(attribute) = schema.attribute(key) { + let props = schema.props(attribute); + + let serialized = rmp_serde::to_vec_named(value)?; + document_store.set_document_field(document_id, attribute, serialized); + + if props.is_indexed() { + let indexer = Indexer { attribute, indexer, 
document_id };
+            value.serialize(indexer)?;
+        }
+
+        if props.is_ranked() {
+            let number = value.serialize(ConvertToNumber)?;
+            ranked_map.insert(document_id, attribute, number);
+        }
+    }
+
+    Ok(())
+}
diff --git a/src/store/documents_fields.rs b/src/store/documents_fields.rs
new file mode 100644
index 000000000..38f7a4256
--- /dev/null
+++ b/src/store/documents_fields.rs
@@ -0,0 +1,82 @@
+use std::convert::TryFrom;
+use meilidb_schema::SchemaAttr;
+use crate::DocumentId;
+
+/// Store of the serialized fields of the documents.
+///
+/// The iterator below expects keys made of the big-endian document id
+/// (8 bytes) followed by the big-endian attribute (2 bytes).
+pub struct DocumentsFields {
+    pub(crate) documents_fields: rkv::SingleStore,
+}
+
+impl DocumentsFields {
+    /// Not implemented yet, calling it panics.
+    pub fn del_all_document_fields(
+        &mut self,
+        writer: &mut rkv::Writer,
+        document_id: DocumentId,
+    ) -> Result<(), rkv::StoreError>
+    {
+        unimplemented!()
+    }
+
+    /// Not implemented yet, calling it panics.
+    pub fn document_field<T: rkv::Readable>(
+        &self,
+        reader: &T,
+        document_id: DocumentId,
+        attribute: SchemaAttr,
+    ) -> Result<Option<&[u8]>, rkv::StoreError>
+    {
+        unimplemented!()
+    }
+
+    /// Returns an iterator over the stored fields of `document_id`.
+    pub fn document_fields<'r, T: rkv::Readable>(
+        &self,
+        reader: &'r T,
+        document_id: DocumentId,
+    ) -> Result<DocumentFieldsIter<'r, T>, rkv::StoreError>
+    {
+        let document_id_bytes = document_id.0.to_be_bytes();
+        let iter = self.documents_fields.iter_from(reader, document_id_bytes)?;
+        Ok(DocumentFieldsIter { reader, document_id, iter })
+    }
+}
+
+/// Iterator over the `(attribute, value)` pairs of a single document.
+pub struct DocumentFieldsIter<'r, T> {
+    reader: &'r T,
+    document_id: DocumentId,
+    iter: rkv::store::single::Iter<'r>,
+}
+
+impl<'r, T: rkv::Readable + 'r> Iterator for DocumentFieldsIter<'r, T> {
+    type Item = Result<(SchemaAttr, &'r [u8]), rkv::StoreError>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let (key, data) = match self.iter.next()? {
+            Ok(entry) => entry,
+            Err(e) => return Some(Err(e)),
+        };
+
+        // The cursor walks until the end of the store: stop as soon as
+        // the key no longer starts with the requested document id.
+        let document_id_bytes = self.document_id.0.to_be_bytes();
+        if key.get(..8) != Some(&document_id_bytes[..]) {
+            return None;
+        }
+
+        match data {
+            Some(rkv::Value::Blob(value)) => {
+                let attr_bytes = key.get(8..8 + 2).unwrap();
+                let array = <[u8; 2]>::try_from(attr_bytes).unwrap();
+                let attr = SchemaAttr::new(u16::from_be_bytes(array));
+                // Yield the stored blob, not the attribute bytes of the key.
+                Some(Ok((attr, value)))
+            },
+            data => panic!("{:?}, {:?}", key, data),
+        }
+    }
+}
diff --git a/src/store/mod.rs b/src/store/mod.rs
index
9c6620484..903607fd1 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -1,26 +1,72 @@ -mod words; +mod documents_fields; mod synonyms; +mod words; -pub use self::words::Words; +pub use self::documents_fields::{DocumentsFields, DocumentFieldsIter}; pub use self::synonyms::Synonyms; +pub use self::words::Words; -const SCHEMA_KEY: &str = "schema"; -const WORDS_KEY: &str = "words"; -const SYNONYMS_KEY: &str = "synonyms"; -const RANKED_MAP_KEY: &str = "ranked-map"; const NUMBER_OF_DOCUMENTS_KEY: &str = "number-of-documents"; +const RANKED_MAP_KEY: &str = "ranked-map"; +const SCHEMA_KEY: &str = "schema"; +const SYNONYMS_KEY: &str = "synonyms"; +const WORDS_KEY: &str = "words"; fn aligned_to(bytes: &[u8], align: usize) -> bool { (bytes as *const _ as *const () as usize) % align == 0 } -pub fn create(env: &rkv::Rkv, name: &str) -> Result<(Words, Synonyms), rkv::StoreError> { - let main = env.open_single(name, rkv::StoreOptions::create())?; - let words_indexes = env.open_single(format!("{}-words-indexes", name).as_str(), rkv::StoreOptions::create())?; - let synonyms = env.open_single(format!("{}-synonyms", name).as_str(), rkv::StoreOptions::create())?; +fn words_indexes_name(name: &str) -> String { + format!("{}-words-indexes", name) +} + +fn synonyms_name(name: &str) -> String { + format!("{}-synonyms", name) +} + +fn documents_fields_name(name: &str) -> String { + format!("{}-documents-fields", name) +} + +pub fn create( + env: &rkv::Rkv, + name: &str, +) -> Result<(Words, Synonyms, DocumentsFields), rkv::StoreError> +{ + open_options(env, name, rkv::StoreOptions::create()) +} + +pub fn open( + env: &rkv::Rkv, + name: &str, +) -> Result<(Words, Synonyms, DocumentsFields), rkv::StoreError> +{ + let mut options = rkv::StoreOptions::default(); + options.create = false; + open_options(env, name, options) +} + +fn open_options( + env: &rkv::Rkv, + name: &str, + options: rkv::StoreOptions, +) -> Result<(Words, Synonyms, DocumentsFields), rkv::StoreError> +{ + // create 
all the database names + let main_name = name; + let words_indexes_name = words_indexes_name(name); + let synonyms_name = synonyms_name(name); + let documents_fields_name = documents_fields_name(name); + + // open all the database names + let main = env.open_single(main_name, options)?; + let words_indexes = env.open_single(words_indexes_name.as_str(), options)?; + let synonyms = env.open_single(synonyms_name.as_str(), options)?; + let documents_fields = env.open_single(documents_fields_name.as_str(), options)?; let words = Words { main, words_indexes }; let synonyms = Synonyms { main, synonyms }; + let documents_fields = DocumentsFields { documents_fields }; - Ok((words, synonyms)) + Ok((words, synonyms, documents_fields)) }