From 6bd779f9ae9ed82667c2f9251fdb2070b4bf91a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 30 Dec 2018 13:22:02 +0100 Subject: [PATCH 1/5] feat: Improve the deserialization time of a Blob --- src/data/doc_ids.rs | 48 +++++----- src/data/doc_indexes.rs | 17 ++-- src/data/mod.rs | 49 ++++------ src/database/blob/mod.rs | 108 ++++++++++------------ src/database/blob/negative/blob.rs | 42 ++++----- src/database/blob/negative/ops.rs | 2 +- src/database/blob/positive/blob.rs | 119 ++++++++++++------------- src/database/database.rs | 27 ++++-- src/database/mod.rs | 7 +- src/database/update/negative/update.rs | 3 +- src/database/update/positive/update.rs | 3 +- 11 files changed, 198 insertions(+), 227 deletions(-) diff --git a/src/data/doc_ids.rs b/src/data/doc_ids.rs index 11bc9fe75..21ab20466 100644 --- a/src/data/doc_ids.rs +++ b/src/data/doc_ids.rs @@ -1,43 +1,45 @@ use std::slice::from_raw_parts; -use std::error::Error; -use std::path::Path; use std::sync::Arc; use std::{io, mem}; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use sdset::Set; -use fst::raw::MmapReadOnly; -use serde::ser::{Serialize, Serializer}; use crate::DocumentId; -use crate::data::Data; +use crate::data::SharedData; #[derive(Default, Clone)] pub struct DocIds { - data: Data, + data: SharedData, } impl DocIds { - pub unsafe fn from_path>(path: P) -> io::Result { - let mmap = MmapReadOnly::open_path(path)?; - let data = Data::Mmap(mmap); - Ok(DocIds { data }) - } - - pub fn from_bytes(vec: Vec) -> Result> { - // FIXME check if modulo DocumentId + pub fn from_bytes(vec: Vec) -> io::Result { let len = vec.len(); - let data = Data::Shared { - bytes: Arc::new(vec), - offset: 0, - len: len - }; + DocIds::from_shared_bytes(Arc::new(vec), 0, len) + } + + pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> io::Result { + let data = SharedData { bytes, offset, len }; + DocIds::from_data(data) + } + + fn from_data(data: SharedData) -> io::Result { + let len = data.as_ref().read_u64::()?; + let data = data.range(mem::size_of::(), len as usize); Ok(DocIds { data }) } - pub fn from_document_ids(vec: Vec) -> Self { + pub fn from_raw(vec: Vec) -> Self { DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap() } + pub fn write_to_bytes(&self, bytes: &mut Vec) { + let len = self.data.len() as u64; + bytes.write_u64::(len).unwrap(); + bytes.extend_from_slice(&self.data); + } + pub fn contains(&self, doc: DocumentId) -> bool { // FIXME prefer using the sdset::exponential_search function self.doc_ids().binary_search(&doc).is_ok() @@ -51,9 +53,3 @@ impl DocIds { Set::new_unchecked(slice) } } - -impl Serialize for DocIds { - fn serialize(&self, serializer: S) -> Result { - self.data.as_ref().serialize(serializer) - } -} diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index ce466a85a..509e1ca2b 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -2,15 +2,13 @@ use std::slice::from_raw_parts; use std::io::{self, Write}; use std::mem::size_of; use std::ops::Index; -use std::path::Path; use std::sync::Arc; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use fst::raw::MmapReadOnly; use sdset::Set; use crate::DocIndex; -use crate::data::Data; +use crate::data::SharedData; #[derive(Debug)] #[repr(C)] @@ -21,27 +19,22 @@ struct Range { #[derive(Clone, Default)] pub struct DocIndexes { - ranges: Data, - indexes: Data, + ranges: SharedData, + indexes: SharedData, } impl DocIndexes { - pub unsafe fn from_path>(path: P) -> io::Result { - let mmap 
= MmapReadOnly::open_path(path)?; - DocIndexes::from_data(Data::Mmap(mmap)) - } - pub fn from_bytes(vec: Vec) -> io::Result { let len = vec.len(); DocIndexes::from_shared_bytes(Arc::new(vec), 0, len) } pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> io::Result { - let data = Data::Shared { bytes, offset, len }; + let data = SharedData { bytes, offset, len }; DocIndexes::from_data(data) } - fn from_data(data: Data) -> io::Result { + fn from_data(data: SharedData) -> io::Result { let ranges_len_offset = data.len() - size_of::(); let ranges_len = (&data[ranges_len_offset..]).read_u64::()?; let ranges_len = ranges_len as usize; diff --git a/src/data/mod.rs b/src/data/mod.rs index b4694493b..365f0353f 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -4,40 +4,30 @@ mod doc_indexes; use std::ops::Deref; use std::sync::Arc; -use fst::raw::MmapReadOnly; - pub use self::doc_ids::DocIds; pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder}; #[derive(Clone)] -enum Data { - Shared { - bytes: Arc>, - offset: usize, - len: usize, - }, - Mmap(MmapReadOnly), +struct SharedData { + bytes: Arc>, + offset: usize, + len: usize, } -impl Data { - pub fn range(&self, off: usize, l: usize) -> Data { - match self { - Data::Shared { bytes, offset, len } => { - assert!(off + l <= *len); - Data::Shared { - bytes: bytes.clone(), - offset: offset + off, - len: l, - } - }, - Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)), +impl SharedData { + pub fn range(&self, offset: usize, len: usize) -> SharedData { + assert!(offset + len <= self.len); + SharedData { + bytes: self.bytes.clone(), + offset: self.offset + offset, + len: len, } } } -impl Default for Data { - fn default() -> Data { - Data::Shared { +impl Default for SharedData { + fn default() -> SharedData { + SharedData { bytes: Arc::default(), offset: 0, len: 0, @@ -45,7 +35,7 @@ impl Default for Data { } } -impl Deref for Data { +impl Deref for SharedData { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -53,13 +43,8 @@ impl Deref for Data { } } -impl AsRef<[u8]> for Data { +impl AsRef<[u8]> for SharedData { fn as_ref(&self) -> &[u8] { - match self { - Data::Shared { bytes, offset, len } => { - &bytes[*offset..offset + len] - }, - Data::Mmap(m) => m.as_slice(), - } + &self.bytes[self.offset..self.offset + self.len] } } diff --git a/src/database/blob/mod.rs b/src/database/blob/mod.rs index d2c9d4253..b4fee637f 100644 --- a/src/database/blob/mod.rs +++ b/src/database/blob/mod.rs @@ -6,11 +6,11 @@ pub use self::positive::{PositiveBlob, PositiveBlobBuilder}; pub use self::negative::NegativeBlob; pub use self::ops::OpBuilder; -use std::fmt; +use std::io::{Cursor, BufRead}; +use std::error::Error; +use std::sync::Arc; -use serde_derive::{Serialize, Deserialize}; -use serde::ser::{Serialize, Serializer, SerializeTuple}; -use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; +use byteorder::{ReadBytesExt, WriteBytesExt}; #[derive(Debug)] pub enum Blob { @@ -33,68 +33,41 @@ impl Blob { Blob::Negative(_) => Sign::Negative, } } -} -impl Serialize for Blob { - fn serialize(&self, serializer: S) -> Result { + pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { + let mut cursor = Cursor::new(&bytes.as_slice()[..len]); + cursor.consume(offset); + + let byte = cursor.read_u8()?; + let blob = match Sign::from_byte(byte)? 
{ + Sign::Positive => { + let offset = cursor.position() as usize; + let len = len - offset; + let blob = PositiveBlob::from_shared_bytes(bytes, offset, len)?; + Blob::Positive(blob) + }, + Sign::Negative => { + let offset = cursor.position() as usize; + let len = len - offset; + let blob = NegativeBlob::from_shared_bytes(bytes, offset, len)?; + Blob::Negative(blob) + }, + }; + + Ok(blob) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + let sign = self.sign(); + sign.write_to_bytes(bytes); match self { - Blob::Positive(blob) => { - let mut tuple = serializer.serialize_tuple(2)?; - tuple.serialize_element(&Sign::Positive)?; - tuple.serialize_element(&blob)?; - tuple.end() - }, - Blob::Negative(blob) => { - let mut tuple = serializer.serialize_tuple(2)?; - tuple.serialize_element(&Sign::Negative)?; - tuple.serialize_element(&blob)?; - tuple.end() - }, + Blob::Positive(b) => b.write_to_bytes(bytes), + Blob::Negative(b) => b.write_to_bytes(bytes), } } } -impl<'de> Deserialize<'de> for Blob { - fn deserialize>(deserializer: D) -> Result { - struct TupleVisitor; - - impl<'de> Visitor<'de> for TupleVisitor { - type Value = Blob; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - formatter.write_str("a Blob struct") - } - - #[inline] - fn visit_seq>(self, mut seq: A) -> Result { - let sign = match seq.next_element()? { - Some(value) => value, - None => return Err(de::Error::invalid_length(0, &self)), - }; - match sign { - Sign::Positive => { - let blob = match seq.next_element()? { - Some(value) => value, - None => return Err(de::Error::invalid_length(1, &self)), - }; - Ok(Blob::Positive(blob)) - }, - Sign::Negative => { - let blob = match seq.next_element()? { - Some(value) => value, - None => return Err(de::Error::invalid_length(1, &self)), - }; - Ok(Blob::Negative(blob)) - }, - } - } - } - - deserializer.deserialize_tuple(2, TupleVisitor) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum Sign { Positive, Negative, @@ -107,4 +80,19 @@ impl Sign { Sign::Negative => Sign::Positive, } } + + pub fn from_byte(byte: u8) -> Result> { + match byte { + 0 => Ok(Sign::Positive), + 1 => Ok(Sign::Negative), + b => Err(format!("Invalid sign byte {:?}", b).into()), + } + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + match self { + Sign::Positive => bytes.write_u8(0).unwrap(), + Sign::Negative => bytes.write_u8(1).unwrap(), + } + } } diff --git a/src/database/blob/negative/blob.rs b/src/database/blob/negative/blob.rs index 04b655b55..fba07d9fd 100644 --- a/src/database/blob/negative/blob.rs +++ b/src/database/blob/negative/blob.rs @@ -1,10 +1,11 @@ +use std::io::{Cursor, BufRead}; use std::error::Error; -use std::path::Path; +use std::sync::Arc; use std::fmt; use sdset::Set; -use serde::de::{self, Deserialize, Deserializer}; -use serde::ser::{Serialize, Serializer}; +use byteorder::{LittleEndian, ReadBytesExt}; + use crate::data::DocIds; use crate::DocumentId; @@ -14,18 +15,26 @@ pub struct NegativeBlob { } impl NegativeBlob { - pub unsafe fn from_path
<P>
(doc_ids: P) -> Result> - where P: AsRef, - { - let doc_ids = DocIds::from_path(doc_ids)?; - Ok(NegativeBlob { doc_ids }) - } - pub fn from_bytes(doc_ids: Vec) -> Result> { let doc_ids = DocIds::from_bytes(doc_ids)?; Ok(NegativeBlob { doc_ids }) } + pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { + let mut cursor = Cursor::new(&bytes.as_slice()[..len]); + cursor.consume(offset); + + let len = cursor.read_u64::()? as usize; + let offset = cursor.position() as usize; + let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?; + + Ok(NegativeBlob::from_raw(doc_ids)) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + self.doc_ids.write_to_bytes(bytes) + } + pub fn from_raw(doc_ids: DocIds) -> Self { NegativeBlob { doc_ids } } @@ -52,16 +61,3 @@ impl fmt::Debug for NegativeBlob { write!(f, ")") } } - -impl Serialize for NegativeBlob { - fn serialize(&self, serializer: S) -> Result { - self.doc_ids.serialize(serializer) - } -} - -impl<'de> Deserialize<'de> for NegativeBlob { - fn deserialize>(deserializer: D) -> Result { - let bytes = Vec::deserialize(deserializer)?; - NegativeBlob::from_bytes(bytes).map_err(de::Error::custom) - } -} diff --git a/src/database/blob/negative/ops.rs b/src/database/blob/negative/ops.rs index bb3b783b8..ff3a548d2 100644 --- a/src/database/blob/negative/ops.rs +++ b/src/database/blob/negative/ops.rs @@ -60,7 +60,7 @@ impl<'a> $name<'a> { pub fn into_negative_blob(self) -> NegativeBlob { let document_ids = sdset::SetOperation::into_set_buf(self.op); - let doc_ids = DocIds::from_document_ids(document_ids.into_vec()); + let doc_ids = DocIds::from_raw(document_ids.into_vec()); NegativeBlob::from_raw(doc_ids) } } diff --git a/src/database/blob/positive/blob.rs b/src/database/blob/positive/blob.rs index df2e8497a..1a294a657 100644 --- a/src/database/blob/positive/blob.rs +++ b/src/database/blob/positive/blob.rs @@ -1,15 +1,16 @@ -use std::fmt; -use std::io::Write; -use std::path::Path; +use std::io::{Write, Cursor, BufRead}; +use std::convert::From; use std::error::Error; +use std::sync::Arc; +use std::fmt; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use fst::{map, Map, Streamer, IntoStreamer}; +use fst::raw::Fst; use sdset::Set; use crate::DocIndex; use crate::data::{DocIndexes, DocIndexesBuilder}; -use serde::ser::{Serialize, Serializer, SerializeTuple}; -use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor}; #[derive(Default)] pub struct PositiveBlob { @@ -18,15 +19,6 @@ pub struct PositiveBlob { } impl PositiveBlob { - pub unsafe fn from_paths(map: P, indexes: Q) -> Result> - where P: AsRef, - Q: AsRef, - { - let map = Map::from_path(map)?; - let indexes = DocIndexes::from_path(indexes)?; - Ok(PositiveBlob { map, indexes }) - } - pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { let map = Map::from_bytes(map)?; let indexes = DocIndexes::from_bytes(indexes)?; @@ -37,6 +29,33 @@ impl PositiveBlob { PositiveBlob { map, indexes } } + pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { + let mut cursor = Cursor::new(&bytes.as_slice()[..len]); + cursor.consume(offset); + + let map_len = cursor.read_u64::()? as usize; + let offset = cursor.position() as usize; + let map = Map::from(Fst::from_shared_bytes(bytes.clone(), offset, map_len)?); + + cursor.consume(map_len); + + let doc_len = cursor.read_u64::()? 
as usize; + let offset = cursor.position() as usize; + let doc_indexes = DocIndexes::from_shared_bytes(bytes, offset, doc_len)?; + + Ok(PositiveBlob::from_raw(map, doc_indexes)) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + let map_bytes = self.map.as_fst().as_bytes(); + bytes.write_u64::(map_bytes.len() as u64).unwrap(); + bytes.extend_from_slice(&map_bytes); + + let doc_indexes_vec = self.indexes.to_vec(); // FIXME patch to have a as_slice() function + bytes.write_u64::(doc_indexes_vec.len() as u64).unwrap(); + bytes.extend_from_slice(&doc_indexes_vec); + } + pub fn get>(&self, key: K) -> Option<&[DocIndex]> { self.map.get(key).map(|index| &self.indexes[index as usize]) } @@ -103,52 +122,6 @@ impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> { } } -impl Serialize for PositiveBlob { - fn serialize(&self, serializer: S) -> Result { - let mut tuple = serializer.serialize_tuple(2)?; - tuple.serialize_element(&self.map.as_fst().to_vec())?; - tuple.serialize_element(&self.indexes.to_vec())?; - tuple.end() - } -} - -impl<'de> Deserialize<'de> for PositiveBlob { - fn deserialize>(deserializer: D) -> Result { - struct TupleVisitor; - - impl<'de> Visitor<'de> for TupleVisitor { - type Value = PositiveBlob; - - fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { - formatter.write_str("a PositiveBlob struct") - } - - #[inline] - fn visit_seq>(self, mut seq: A) -> Result { - let map = match seq.next_element()? { - Some(bytes) => match Map::from_bytes(bytes) { - Ok(value) => value, - Err(err) => return Err(de::Error::custom(err)), - }, - None => return Err(de::Error::invalid_length(0, &self)), - }; - - let indexes = match seq.next_element()? { - Some(bytes) => match DocIndexes::from_bytes(bytes) { - Ok(value) => value, - Err(err) => return Err(de::Error::custom(err)), - }, - None => return Err(de::Error::invalid_length(1, &self)), - }; - - Ok(PositiveBlob { map, indexes }) - } - } - - deserializer.deserialize_tuple(2, TupleVisitor) - } -} - pub struct PositiveBlobBuilder { map: fst::MapBuilder, indexes: DocIndexesBuilder, @@ -207,6 +180,29 @@ mod tests { use crate::DocumentId; + #[test] + fn create_query() -> Result<(), Box> { + let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; + let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; + let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; + + let mut builder = PositiveBlobBuilder::memory(); + + builder.insert("aaa", Set::new(&[a])?)?; + builder.insert("aab", Set::new(&[a, b, c])?)?; + builder.insert("aac", Set::new(&[a, c])?)?; + + let (map_bytes, indexes_bytes) = builder.into_inner()?; + let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?; + + assert_eq!(positive_blob.get("aaa"), Some(&[a][..])); + assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..])); + assert_eq!(positive_blob.get("aac"), Some(&[a, c][..])); + assert_eq!(positive_blob.get("aad"), None); + + Ok(()) + } + #[test] fn serialize_deserialize() -> Result<(), Box> { let a = DocIndex { @@ -269,9 +265,6 @@ mod tests { let (map_bytes, indexes_bytes) = builder.into_inner()?; let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?; - let bytes = bincode::serialize(&positive_blob)?; - let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?; - assert_eq!(positive_blob.get("aaa"), Some(&[a][..])); assert_eq!(positive_blob.get("aab"), 
Some(&[a, b, c][..])); assert_eq!(positive_blob.get("aac"), Some(&[a, c][..])); diff --git a/src/database/database.rs b/src/database/database.rs index 507f9436b..37d95af1c 100644 --- a/src/database/database.rs +++ b/src/database/database.rs @@ -7,9 +7,9 @@ use rocksdb::rocksdb::{Writable, Snapshot}; use rocksdb::{DB, DBVector, MergeOperands}; use crossbeam::atomic::ArcCell; +use crate::database::blob::{self, Blob, PositiveBlob}; use crate::database::{DatabaseView, Update, Schema}; use crate::database::{DATA_INDEX, DATA_SCHEMA}; -use crate::database::blob::{self, Blob}; pub struct Database { // DB is under a Mutex to sync update ingestions and separate DB update locking @@ -136,18 +136,31 @@ fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut Merge }; let mut op = blob::OpBuilder::with_capacity(capacity); - if let Some(existing_value) = existing_value { - let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index"); + if let Some(bytes) = existing_value { + let bytes_len = bytes.len(); + let bytes = Arc::new(bytes.to_vec()); + let blob = match PositiveBlob::from_shared_bytes(bytes, 0, bytes_len) { + Ok(blob) => blob, + Err(e) => panic!("BUG: could not deserialize data-index due to {}", e), + }; op.push(Blob::Positive(blob)); } for bytes in operands { - let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob"); + let bytes_len = bytes.len(); + let bytes = Arc::new(bytes.to_vec()); + let blob = match Blob::from_shared_bytes(bytes, 0, bytes_len) { + Ok(blob) => blob, + Err(e) => panic!("BUG: could not deserialize blob due to {}", e), + }; op.push(blob); } let blob = op.merge().expect("BUG: could not merge blobs"); - bincode::serialize(&blob).expect("BUG: could not serialize merged blob") + + let mut bytes = Vec::new(); + blob.write_to_bytes(&mut bytes); + bytes } #[cfg(test)] @@ -158,9 +171,9 @@ mod tests { use serde_derive::{Serialize, Deserialize}; use tempfile::tempdir; - use crate::tokenizer::DefaultBuilder; - use crate::database::update::PositiveUpdateBuilder; use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; + use crate::database::update::PositiveUpdateBuilder; + use crate::tokenizer::DefaultBuilder; #[test] fn ingest_update_file() -> Result<(), Box> { diff --git a/src/database/mod.rs b/src/database/mod.rs index 829d9201b..6a776d239 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -2,6 +2,7 @@ use std::collections::hash_map::DefaultHasher; use std::hash::{Hash, Hasher}; use std::error::Error; use std::ops::Deref; +use std::sync::Arc; use rocksdb::rocksdb::{DB, Snapshot}; @@ -55,7 +56,11 @@ fn retrieve_data_index(snapshot: &Snapshot) -> Result { match snapshot.get(DATA_INDEX)? { - Some(vector) => Ok(bincode::deserialize(&*vector)?), + Some(vector) => { + let bytes_len = vector.as_ref().len(); + let bytes = Arc::new(vector.as_ref().to_vec()); + Ok(PositiveBlob::from_shared_bytes(bytes, 0, bytes_len)?) + }, None => Ok(PositiveBlob::default()), } } diff --git a/src/database/update/negative/update.rs b/src/database/update/negative/update.rs index 3d4c4d061..4b0c83784 100644 --- a/src/database/update/negative/update.rs +++ b/src/database/update/negative/update.rs @@ -38,7 +38,8 @@ impl NegativeUpdateBuilder { let blob = Blob::Negative(negative_blob); // write the data-index aka negative blob - let bytes = bincode::serialize(&blob)?; + let mut bytes = Vec::new(); + blob.write_to_bytes(&mut bytes); file_writer.merge(DATA_INDEX, &bytes)?; // FIXME remove this ugly thing ! 
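Note on the on-disk format this patch standardizes on: each structure now writes a little-endian u64 length prefix followed by its raw payload, and deserializes by reading that prefix back out of a shared Arc<Vec<u8>> buffer (offset + length) instead of round-tripping through bincode. A minimal sketch of that round-trip, with illustrative function names that are not part of the patch:

    use std::io::{self, Cursor};
    use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

    // Writing: a u64 little-endian length prefix, then the payload itself.
    fn write_len_prefixed(out: &mut Vec<u8>, payload: &[u8]) {
        out.write_u64::<LittleEndian>(payload.len() as u64).unwrap();
        out.extend_from_slice(payload);
    }

    // Reading: recover the payload as a slice of the same buffer, no copy.
    // (A real reader would bounds-check before slicing.)
    fn read_len_prefixed(bytes: &[u8]) -> io::Result<&[u8]> {
        let mut cursor = Cursor::new(bytes);
        let len = cursor.read_u64::<LittleEndian>()? as usize;
        let offset = cursor.position() as usize;
        Ok(&bytes[offset..offset + len])
    }

The write_to_bytes/from_shared_bytes pairs introduced above (DocIds, NegativeBlob, PositiveBlob) follow this layout; the speedup comes from cloning and slicing the Arc rather than parsing and copying the bytes.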
diff --git a/src/database/update/positive/update.rs b/src/database/update/positive/update.rs index 244ef9e9a..6316a2e7f 100644 --- a/src/database/update/positive/update.rs +++ b/src/database/update/positive/update.rs @@ -485,7 +485,8 @@ impl PositiveUpdateBuilder { let blob = Blob::Positive(positive_blob); // write the data-index aka positive blob - let bytes = bincode::serialize(&blob)?; + let mut bytes = Vec::new(); + blob.write_to_bytes(&mut bytes); file_writer.merge(DATA_INDEX, &bytes)?; // write all the documents fields updates From 0080bf486f5c6c5326d2117ea0cc1f053ecebc7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sun, 30 Dec 2018 16:17:18 +0100 Subject: [PATCH 2/5] feat: Introduce the new Index structure replacing the old ugly Blob system --- examples/create-database.rs | 8 +- src/data/doc_ids.rs | 8 + src/data/doc_indexes.rs | 29 +- src/data/mod.rs | 14 +- src/database/blob/mod.rs | 98 ---- src/database/blob/negative/blob.rs | 63 --- src/database/blob/negative/mod.rs | 5 - src/database/blob/negative/ops.rs | 73 --- src/database/blob/ops.rs | 109 ---- src/database/blob/positive/blob.rs | 275 ---------- src/database/blob/positive/mod.rs | 5 - src/database/blob/positive/ops.rs | 128 ----- src/database/database.rs | 60 +-- src/database/database_view.rs | 12 +- src/database/index/mod.rs | 81 +++ src/database/index/negative.rs | 53 ++ src/database/index/positive.rs | 172 ++++++ src/database/mod.rs | 15 +- src/database/schema.rs | 194 +------ src/database/update/builder.rs | 95 ++++ src/database/update/mod.rs | 32 +- src/database/update/negative/mod.rs | 4 - .../update/negative/unordered_builder.rs | 37 -- src/database/update/negative/update.rs | 61 --- src/database/update/positive/mod.rs | 4 - .../update/positive/unordered_builder.rs | 49 -- src/database/update/positive/update.rs | 505 ------------------ src/rank/query_builder.rs | 4 +- 28 files changed, 481 insertions(+), 1712 deletions(-) delete mode 100644 src/database/blob/mod.rs delete mode 100644 src/database/blob/negative/blob.rs delete mode 100644 src/database/blob/negative/mod.rs delete mode 100644 src/database/blob/negative/ops.rs delete mode 100644 src/database/blob/ops.rs delete mode 100644 src/database/blob/positive/blob.rs delete mode 100644 src/database/blob/positive/mod.rs delete mode 100644 src/database/blob/positive/ops.rs create mode 100644 src/database/index/mod.rs create mode 100644 src/database/index/negative.rs create mode 100644 src/database/index/positive.rs create mode 100644 src/database/update/builder.rs delete mode 100644 src/database/update/negative/mod.rs delete mode 100644 src/database/update/negative/unordered_builder.rs delete mode 100644 src/database/update/negative/update.rs delete mode 100644 src/database/update/positive/mod.rs delete mode 100644 src/database/update/positive/unordered_builder.rs delete mode 100644 src/database/update/positive/update.rs diff --git a/examples/create-database.rs b/examples/create-database.rs index e1f1cb0b5..5c6e81d1d 100644 --- a/examples/create-database.rs +++ b/examples/create-database.rs @@ -5,7 +5,7 @@ use serde_derive::{Serialize, Deserialize}; use structopt::StructOpt; use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED}; -use meilidb::database::PositiveUpdateBuilder; +use meilidb::database::UpdateBuilder; use meilidb::tokenizer::DefaultBuilder; use meilidb::database::Database; @@ -44,7 +44,7 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result Result Self { + DocIds { data: 
SharedData::empty() } + } + pub fn from_bytes(vec: Vec) -> io::Result { let len = vec.len(); DocIds::from_shared_bytes(Arc::new(vec), 0, len) @@ -24,6 +28,10 @@ impl DocIds { DocIds::from_data(data) } + pub fn as_bytes(&self) -> &[u8] { + &self.data + } + fn from_data(data: SharedData) -> io::Result { let len = data.as_ref().read_u64::()?; let data = data.range(mem::size_of::(), len as usize); diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs index 509e1ca2b..21627cb0d 100644 --- a/src/data/doc_indexes.rs +++ b/src/data/doc_indexes.rs @@ -24,17 +24,17 @@ pub struct DocIndexes { } impl DocIndexes { - pub fn from_bytes(vec: Vec) -> io::Result { + pub fn from_bytes(vec: Vec) -> io::Result { let len = vec.len(); DocIndexes::from_shared_bytes(Arc::new(vec), 0, len) } - pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> io::Result { + pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> io::Result { let data = SharedData { bytes, offset, len }; DocIndexes::from_data(data) } - fn from_data(data: SharedData) -> io::Result { + fn from_data(data: SharedData) -> io::Result { let ranges_len_offset = data.len() - size_of::(); let ranges_len = (&data[ranges_len_offset..]).read_u64::()?; let ranges_len = ranges_len as usize; @@ -47,19 +47,21 @@ impl DocIndexes { Ok(DocIndexes { ranges, indexes }) } - pub fn to_vec(&self) -> Vec { - let capacity = self.indexes.len() + self.ranges.len() + size_of::(); - let mut bytes = Vec::with_capacity(capacity); + pub fn write_to_bytes(&self, bytes: &mut Vec) { + let ranges_len = self.ranges.len() as u64; + let indexes_len = self.indexes.len() as u64; + let u64_size = size_of::() as u64; + let len = indexes_len + ranges_len + u64_size; + + let _ = bytes.write_u64::(len); bytes.extend_from_slice(&self.indexes); bytes.extend_from_slice(&self.ranges); - bytes.write_u64::(self.ranges.len() as u64).unwrap(); - - bytes + let _ = bytes.write_u64::(ranges_len); } pub fn get(&self, index: usize) -> Option<&Set> { - self.ranges().get(index as usize).map(|Range { start, end }| { + self.ranges().get(index).map(|Range { start, end }| { let start = *start as usize; let end = *end as usize; let slice = &self.indexes()[start..end]; @@ -216,9 +218,12 @@ mod tests { let builder_bytes = builder.into_inner()?; let docs = DocIndexes::from_bytes(builder_bytes.clone())?; - let bytes = docs.to_vec(); - assert_eq!(builder_bytes, bytes); + let mut bytes = Vec::new(); + docs.write_to_bytes(&mut bytes); + let len = size_of::(); + + assert_eq!(builder_bytes, &bytes[len..]); Ok(()) } diff --git a/src/data/mod.rs b/src/data/mod.rs index 365f0353f..69888dfcf 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -15,6 +15,14 @@ struct SharedData { } impl SharedData { + pub fn empty() -> SharedData { + SharedData { + bytes: Arc::default(), + offset: 0, + len: 0, + } + } + pub fn range(&self, offset: usize, len: usize) -> SharedData { assert!(offset + len <= self.len); SharedData { @@ -27,11 +35,7 @@ impl SharedData { impl Default for SharedData { fn default() -> SharedData { - SharedData { - bytes: Arc::default(), - offset: 0, - len: 0, - } + SharedData::empty() } } diff --git a/src/database/blob/mod.rs b/src/database/blob/mod.rs deleted file mode 100644 index b4fee637f..000000000 --- a/src/database/blob/mod.rs +++ /dev/null @@ -1,98 +0,0 @@ -mod ops; -pub mod positive; -pub mod negative; - -pub use self::positive::{PositiveBlob, PositiveBlobBuilder}; -pub use self::negative::NegativeBlob; -pub use self::ops::OpBuilder; - -use std::io::{Cursor, BufRead}; -use 
std::error::Error; -use std::sync::Arc; - -use byteorder::{ReadBytesExt, WriteBytesExt}; - -#[derive(Debug)] -pub enum Blob { - Positive(PositiveBlob), - Negative(NegativeBlob), -} - -impl Blob { - pub fn is_negative(&self) -> bool { - self.sign() == Sign::Negative - } - - pub fn is_positive(&self) -> bool { - self.sign() == Sign::Positive - } - - pub fn sign(&self) -> Sign { - match self { - Blob::Positive(_) => Sign::Positive, - Blob::Negative(_) => Sign::Negative, - } - } - - pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { - let mut cursor = Cursor::new(&bytes.as_slice()[..len]); - cursor.consume(offset); - - let byte = cursor.read_u8()?; - let blob = match Sign::from_byte(byte)? { - Sign::Positive => { - let offset = cursor.position() as usize; - let len = len - offset; - let blob = PositiveBlob::from_shared_bytes(bytes, offset, len)?; - Blob::Positive(blob) - }, - Sign::Negative => { - let offset = cursor.position() as usize; - let len = len - offset; - let blob = NegativeBlob::from_shared_bytes(bytes, offset, len)?; - Blob::Negative(blob) - }, - }; - - Ok(blob) - } - - pub fn write_to_bytes(&self, bytes: &mut Vec) { - let sign = self.sign(); - sign.write_to_bytes(bytes); - match self { - Blob::Positive(b) => b.write_to_bytes(bytes), - Blob::Negative(b) => b.write_to_bytes(bytes), - } - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub enum Sign { - Positive, - Negative, -} - -impl Sign { - pub fn invert(self) -> Sign { - match self { - Sign::Positive => Sign::Negative, - Sign::Negative => Sign::Positive, - } - } - - pub fn from_byte(byte: u8) -> Result> { - match byte { - 0 => Ok(Sign::Positive), - 1 => Ok(Sign::Negative), - b => Err(format!("Invalid sign byte {:?}", b).into()), - } - } - - pub fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - Sign::Positive => bytes.write_u8(0).unwrap(), - Sign::Negative => bytes.write_u8(1).unwrap(), - } - } -} diff --git a/src/database/blob/negative/blob.rs b/src/database/blob/negative/blob.rs deleted file mode 100644 index fba07d9fd..000000000 --- a/src/database/blob/negative/blob.rs +++ /dev/null @@ -1,63 +0,0 @@ -use std::io::{Cursor, BufRead}; -use std::error::Error; -use std::sync::Arc; -use std::fmt; - -use sdset::Set; -use byteorder::{LittleEndian, ReadBytesExt}; - -use crate::data::DocIds; -use crate::DocumentId; - -#[derive(Default)] -pub struct NegativeBlob { - doc_ids: DocIds, -} - -impl NegativeBlob { - pub fn from_bytes(doc_ids: Vec) -> Result> { - let doc_ids = DocIds::from_bytes(doc_ids)?; - Ok(NegativeBlob { doc_ids }) - } - - pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { - let mut cursor = Cursor::new(&bytes.as_slice()[..len]); - cursor.consume(offset); - - let len = cursor.read_u64::()? 
as usize; - let offset = cursor.position() as usize; - let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?; - - Ok(NegativeBlob::from_raw(doc_ids)) - } - - pub fn write_to_bytes(&self, bytes: &mut Vec) { - self.doc_ids.write_to_bytes(bytes) - } - - pub fn from_raw(doc_ids: DocIds) -> Self { - NegativeBlob { doc_ids } - } - - pub fn as_ids(&self) -> &DocIds { - &self.doc_ids - } - - pub fn into_doc_ids(self) -> DocIds { - self.doc_ids - } -} - -impl AsRef> for NegativeBlob { - fn as_ref(&self) -> &Set { - self.as_ids().doc_ids() - } -} - -impl fmt::Debug for NegativeBlob { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "NegativeBlob(")?; - f.debug_list().entries(self.as_ref().as_slice()).finish()?; - write!(f, ")") - } -} diff --git a/src/database/blob/negative/mod.rs b/src/database/blob/negative/mod.rs deleted file mode 100644 index ce0000da0..000000000 --- a/src/database/blob/negative/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -mod blob; -mod ops; - -pub use self::blob::NegativeBlob; -pub use self::ops::OpBuilder; diff --git a/src/database/blob/negative/ops.rs b/src/database/blob/negative/ops.rs deleted file mode 100644 index ff3a548d2..000000000 --- a/src/database/blob/negative/ops.rs +++ /dev/null @@ -1,73 +0,0 @@ -use sdset::multi::OpBuilder as SdOpBuilder; -use sdset::Set; - -use crate::database::blob::NegativeBlob; -use crate::data::DocIds; -use crate::DocumentId; - -pub struct OpBuilder<'a> { - inner: SdOpBuilder<'a, DocumentId>, -} - -/// Do a set operation on multiple negative blobs. -impl<'a> OpBuilder<'a> { - pub fn new() -> Self { - Self { inner: SdOpBuilder::new() } - } - - pub fn with_capacity(cap: usize) -> Self { - Self { inner: SdOpBuilder::with_capacity(cap) } - } - - pub fn add(mut self, blob: &'a NegativeBlob) -> Self { - self.push(blob); - self - } - - pub fn push(&mut self, blob: &'a NegativeBlob) { - let set = Set::new_unchecked(blob.as_ref()); - self.inner.push(set); - } - - pub fn union(self) -> Union<'a> { - Union::new(self.inner.union()) - } - - pub fn intersection(self) -> Intersection<'a> { - Intersection::new(self.inner.intersection()) - } - - pub fn difference(self) -> Difference<'a> { - Difference::new(self.inner.difference()) - } - - pub fn symmetric_difference(self) -> SymmetricDifference<'a> { - SymmetricDifference::new(self.inner.symmetric_difference()) - } -} - -macro_rules! 
logical_operation { - (struct $name:ident, $operation:ident) => { - -pub struct $name<'a> { - op: sdset::multi::$name<'a, DocumentId>, -} - -impl<'a> $name<'a> { - fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self { - $name { op } - } - - pub fn into_negative_blob(self) -> NegativeBlob { - let document_ids = sdset::SetOperation::into_set_buf(self.op); - let doc_ids = DocIds::from_raw(document_ids.into_vec()); - NegativeBlob::from_raw(doc_ids) - } -} - -}} - -logical_operation!(struct Union, union); -logical_operation!(struct Intersection, intersection); -logical_operation!(struct Difference, difference); -logical_operation!(struct SymmetricDifference, symmetric_difference); diff --git a/src/database/blob/ops.rs b/src/database/blob/ops.rs deleted file mode 100644 index 0aeea037d..000000000 --- a/src/database/blob/ops.rs +++ /dev/null @@ -1,109 +0,0 @@ -use std::error::Error; - -use fst::{IntoStreamer, Streamer}; -use sdset::duo::DifferenceByKey; -use sdset::{Set, SetOperation}; -use group_by::GroupBy; - -use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob}; -use crate::database::blob::{positive, negative}; - -fn blob_same_sign(a: &Blob, b: &Blob) -> bool { - a.sign() == b.sign() -} - -fn unwrap_positive(blob: &Blob) -> &PositiveBlob { - match blob { - Blob::Positive(blob) => blob, - Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"), - } -} - -fn unwrap_negative(blob: &Blob) -> &NegativeBlob { - match blob { - Blob::Negative(blob) => blob, - Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"), - } -} - -pub struct OpBuilder { - blobs: Vec, -} - -impl OpBuilder { - pub fn new() -> OpBuilder { - OpBuilder { blobs: Vec::new() } - } - - pub fn with_capacity(cap: usize) -> OpBuilder { - OpBuilder { blobs: Vec::with_capacity(cap) } - } - - pub fn push(&mut self, blob: Blob) { - if self.blobs.is_empty() && blob.is_negative() { return } - self.blobs.push(blob); - } - - pub fn merge(self) -> Result> { - let groups = GroupBy::new(&self.blobs, blob_same_sign); - let mut aggregated = Vec::new(); - - for blobs in groups { - match blobs[0].sign() { - Sign::Positive => { - let mut op_builder = positive::OpBuilder::with_capacity(blobs.len()); - for blob in blobs { - op_builder.push(unwrap_positive(blob)); - } - - let mut stream = op_builder.union().into_stream(); - let mut builder = PositiveBlobBuilder::memory(); - while let Some((input, doc_indexes)) = stream.next() { - // FIXME empty doc_indexes must be handled by OpBuilder - if !doc_indexes.is_empty() { - builder.insert(input, doc_indexes).unwrap(); - } - } - let (map, doc_indexes) = builder.into_inner().unwrap(); - let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap(); - aggregated.push(Blob::Positive(blob)); - }, - Sign::Negative => { - let mut op_builder = negative::OpBuilder::with_capacity(blobs.len()); - for blob in blobs { - op_builder.push(unwrap_negative(blob)); - } - let blob = op_builder.union().into_negative_blob(); - aggregated.push(Blob::Negative(blob)); - }, - } - } - - let mut buffer = Vec::new(); - aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| { - let negative = NegativeBlob::default(); - let (positive, negative) = match slice { - [a, b] => (unwrap_positive(a), unwrap_negative(b)), - [a] => (unwrap_positive(a), &negative), - _ => unreachable!(), - }; - - let mut builder = PositiveBlobBuilder::memory(); - - let op_builder = positive::OpBuilder::new().add(&base).add(&positive); - let mut stream = 
op_builder.union().into_stream(); - while let Some((input, doc_indexes)) = stream.next() { - let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x); - - buffer.clear(); - op.extend_vec(&mut buffer); - if !buffer.is_empty() { - builder.insert(input, Set::new_unchecked(&buffer))?; - } - } - - let (map, doc_indexes) = builder.into_inner()?; - PositiveBlob::from_bytes(map, doc_indexes) - }) - } -} diff --git a/src/database/blob/positive/blob.rs b/src/database/blob/positive/blob.rs deleted file mode 100644 index 1a294a657..000000000 --- a/src/database/blob/positive/blob.rs +++ /dev/null @@ -1,275 +0,0 @@ -use std::io::{Write, Cursor, BufRead}; -use std::convert::From; -use std::error::Error; -use std::sync::Arc; -use std::fmt; - -use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; -use fst::{map, Map, Streamer, IntoStreamer}; -use fst::raw::Fst; -use sdset::Set; - -use crate::DocIndex; -use crate::data::{DocIndexes, DocIndexesBuilder}; - -#[derive(Default)] -pub struct PositiveBlob { - map: Map, - indexes: DocIndexes, -} - -impl PositiveBlob { - pub fn from_bytes(map: Vec, indexes: Vec) -> Result> { - let map = Map::from_bytes(map)?; - let indexes = DocIndexes::from_bytes(indexes)?; - Ok(PositiveBlob { map, indexes }) - } - - pub fn from_raw(map: Map, indexes: DocIndexes) -> Self { - PositiveBlob { map, indexes } - } - - pub fn from_shared_bytes(bytes: Arc>, offset: usize, len: usize) -> Result> { - let mut cursor = Cursor::new(&bytes.as_slice()[..len]); - cursor.consume(offset); - - let map_len = cursor.read_u64::()? as usize; - let offset = cursor.position() as usize; - let map = Map::from(Fst::from_shared_bytes(bytes.clone(), offset, map_len)?); - - cursor.consume(map_len); - - let doc_len = cursor.read_u64::()? as usize; - let offset = cursor.position() as usize; - let doc_indexes = DocIndexes::from_shared_bytes(bytes, offset, doc_len)?; - - Ok(PositiveBlob::from_raw(map, doc_indexes)) - } - - pub fn write_to_bytes(&self, bytes: &mut Vec) { - let map_bytes = self.map.as_fst().as_bytes(); - bytes.write_u64::(map_bytes.len() as u64).unwrap(); - bytes.extend_from_slice(&map_bytes); - - let doc_indexes_vec = self.indexes.to_vec(); // FIXME patch to have a as_slice() function - bytes.write_u64::(doc_indexes_vec.len() as u64).unwrap(); - bytes.extend_from_slice(&doc_indexes_vec); - } - - pub fn get>(&self, key: K) -> Option<&[DocIndex]> { - self.map.get(key).map(|index| &self.indexes[index as usize]) - } - - pub fn as_map(&self) -> &Map { - &self.map - } - - pub fn as_indexes(&self) -> &DocIndexes { - &self.indexes - } - - pub fn explode(self) -> (Map, DocIndexes) { - (self.map, self.indexes) - } -} - -impl fmt::Debug for PositiveBlob { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "PositiveBlob([")?; - let mut stream = self.into_stream(); - let mut first = true; - while let Some((k, v)) = stream.next() { - if !first { - write!(f, ", ")?; - } - first = false; - write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?; - } - write!(f, "])") - } -} - -impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob { - type Item = (&'a [u8], &'a [DocIndex]); - /// The type of the stream to be constructed. - type Into = PositiveBlobStream<'m>; - - /// Construct a stream from `Self`. 
- fn into_stream(self) -> Self::Into { - PositiveBlobStream { - map_stream: self.map.into_stream(), - doc_indexes: &self.indexes, - } - } -} - -pub struct PositiveBlobStream<'m> { - map_stream: map::Stream<'m>, - doc_indexes: &'m DocIndexes, -} - -impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> { - type Item = (&'a [u8], &'a [DocIndex]); - - fn next(&'a mut self) -> Option { - match self.map_stream.next() { - Some((input, index)) => { - let doc_indexes = &self.doc_indexes[index as usize]; - Some((input, doc_indexes)) - }, - None => None, - } - } -} - -pub struct PositiveBlobBuilder { - map: fst::MapBuilder, - indexes: DocIndexesBuilder, - value: u64, -} - -impl PositiveBlobBuilder, Vec> { - pub fn memory() -> Self { - PositiveBlobBuilder { - map: fst::MapBuilder::memory(), - indexes: DocIndexesBuilder::memory(), - value: 0, - } - } -} - -impl PositiveBlobBuilder { - pub fn new(map: W, indexes: X) -> Result> { - Ok(PositiveBlobBuilder { - map: fst::MapBuilder::new(map)?, - indexes: DocIndexesBuilder::new(indexes), - value: 0, - }) - } - - /// If a key is inserted that is less than or equal to any previous key added, - /// then an error is returned. Similarly, if there was a problem writing - /// to the underlying writer, an error is returned. - // FIXME what if one write doesn't work but the other do ? - pub fn insert(&mut self, key: K, doc_indexes: &Set) -> Result<(), Box> - where K: AsRef<[u8]>, - { - self.map.insert(key, self.value)?; - self.indexes.insert(doc_indexes)?; - self.value += 1; - Ok(()) - } - - pub fn finish(self) -> Result<(), Box> { - self.into_inner().map(drop) - } - - pub fn into_inner(self) -> Result<(W, X), Box> { - let map = self.map.into_inner()?; - let indexes = self.indexes.into_inner()?; - Ok((map, indexes)) - } -} - -#[cfg(test)] -mod tests { - use super::*; - - use std::error::Error; - use crate::{Attribute, WordArea}; - - use crate::DocumentId; - - #[test] - fn create_query() -> Result<(), Box> { - let a = DocIndex { document_id: DocumentId(0), attribute: Attribute::new(3, 11), word_area: WordArea::new(30, 4) }; - let b = DocIndex { document_id: DocumentId(1), attribute: Attribute::new(4, 21), word_area: WordArea::new(35, 6) }; - let c = DocIndex { document_id: DocumentId(2), attribute: Attribute::new(8, 2), word_area: WordArea::new(89, 6) }; - - let mut builder = PositiveBlobBuilder::memory(); - - builder.insert("aaa", Set::new(&[a])?)?; - builder.insert("aab", Set::new(&[a, b, c])?)?; - builder.insert("aac", Set::new(&[a, c])?)?; - - let (map_bytes, indexes_bytes) = builder.into_inner()?; - let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?; - - assert_eq!(positive_blob.get("aaa"), Some(&[a][..])); - assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..])); - assert_eq!(positive_blob.get("aac"), Some(&[a, c][..])); - assert_eq!(positive_blob.get("aad"), None); - - Ok(()) - } - - #[test] - fn serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { - document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) - }; - let b = DocIndex { - document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) - }; - let c = DocIndex { - document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) - }; - - let mut builder = PositiveBlobBuilder::memory(); - - builder.insert("aaa", Set::new(&[a])?)?; - builder.insert("aab", Set::new(&[a, b, c])?)?; - builder.insert("aac", 
Set::new(&[a, c])?)?; - - let (map_bytes, indexes_bytes) = builder.into_inner()?; - let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?; - - assert_eq!(positive_blob.get("aaa"), Some(&[a][..])); - assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..])); - assert_eq!(positive_blob.get("aac"), Some(&[a, c][..])); - assert_eq!(positive_blob.get("aad"), None); - - Ok(()) - } - - #[test] - fn serde_serialize_deserialize() -> Result<(), Box> { - let a = DocIndex { - document_id: DocumentId(0), - attribute: Attribute::new_faillible(3, 11), - word_area: WordArea::new_faillible(30, 4) - }; - let b = DocIndex { - document_id: DocumentId(1), - attribute: Attribute::new_faillible(4, 21), - word_area: WordArea::new_faillible(35, 6) - }; - let c = DocIndex { - document_id: DocumentId(2), - attribute: Attribute::new_faillible(8, 2), - word_area: WordArea::new_faillible(89, 6) - }; - - let mut builder = PositiveBlobBuilder::memory(); - - builder.insert("aaa", Set::new(&[a])?)?; - builder.insert("aab", Set::new(&[a, b, c])?)?; - builder.insert("aac", Set::new(&[a, c])?)?; - - let (map_bytes, indexes_bytes) = builder.into_inner()?; - let positive_blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?; - - assert_eq!(positive_blob.get("aaa"), Some(&[a][..])); - assert_eq!(positive_blob.get("aab"), Some(&[a, b, c][..])); - assert_eq!(positive_blob.get("aac"), Some(&[a, c][..])); - assert_eq!(positive_blob.get("aad"), None); - - Ok(()) - } -} diff --git a/src/database/blob/positive/mod.rs b/src/database/blob/positive/mod.rs deleted file mode 100644 index d8e4e164e..000000000 --- a/src/database/blob/positive/mod.rs +++ /dev/null @@ -1,5 +0,0 @@ -mod blob; -mod ops; - -pub use self::blob::{PositiveBlob, PositiveBlobBuilder}; -pub use self::ops::OpBuilder; diff --git a/src/database/blob/positive/ops.rs b/src/database/blob/positive/ops.rs deleted file mode 100644 index ffd66cead..000000000 --- a/src/database/blob/positive/ops.rs +++ /dev/null @@ -1,128 +0,0 @@ -use sdset::multi::OpBuilder as SdOpBuilder; -use sdset::{SetOperation, Set}; - -use crate::database::blob::PositiveBlob; -use crate::data::DocIndexes; -use crate::DocIndex; - -pub struct OpBuilder<'m> { - // the operation on the maps is always an union. - map_op: fst::map::OpBuilder<'m>, - indexes: Vec<&'m DocIndexes>, -} - -/// Do a set operation on multiple positive blobs. -impl<'m> OpBuilder<'m> { - pub fn new() -> Self { - Self { - map_op: fst::map::OpBuilder::new(), - indexes: Vec::new(), - } - } - - pub fn with_capacity(cap: usize) -> Self { - Self { - map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity - indexes: Vec::with_capacity(cap), - } - } - - pub fn add(mut self, blob: &'m PositiveBlob) -> Self { - self.push(blob); - self - } - - pub fn push(&mut self, blob: &'m PositiveBlob) { - self.map_op.push(blob.as_map()); - self.indexes.push(blob.as_indexes()); - } - - pub fn union(self) -> Union<'m> { - Union::new(self.map_op.union(), self.indexes) - } - - pub fn intersection(self) -> Intersection<'m> { - Intersection::new(self.map_op.union(), self.indexes) - } - - pub fn difference(self) -> Difference<'m> { - Difference::new(self.map_op.union(), self.indexes) - } - - pub fn symmetric_difference(self) -> SymmetricDifference<'m> { - SymmetricDifference::new(self.map_op.union(), self.indexes) - } -} - -macro_rules! 
logical_operation { - (struct $name:ident, $operation:ident) => { - -pub struct $name<'m> { - stream: fst::map::Union<'m>, - indexes: Vec<&'m DocIndexes>, - outs: Vec, -} - -impl<'m> $name<'m> { - fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self { - $name { - stream: stream, - indexes: indexes, - outs: Vec::new(), - } - } -} - -impl<'m, 'a> fst::Streamer<'a> for $name<'m> { - type Item = (&'a [u8], &'a Set); - - fn next(&'a mut self) -> Option { - // loop { - // let (input, ivalues) = match self.stream.next() { - // Some(value) => value, - // None => return None, - // }; - - // self.outs.clear(); - - // let mut builder = SdOpBuilder::with_capacity(ivalues.len()); - // for ivalue in ivalues { - // let indexes = self.indexes[ivalue.index]; - // let indexes = indexes.get(ivalue.value).expect("BUG: could not find document indexes"); - // let set = Set::new_unchecked(indexes); - // builder.push(set); - // } - - // builder.$operation().extend_vec(&mut self.outs); - - // if self.outs.is_empty() { continue } - // return Some((input, &self.outs)) - // } - - // FIXME make the above code compile - match self.stream.next() { - Some((input, ivalues)) => { - self.outs.clear(); - - let mut builder = SdOpBuilder::with_capacity(ivalues.len()); - for ivalue in ivalues { - let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize]; - let set = Set::new_unchecked(doc_indexes); - builder.push(set); - } - - builder.$operation().extend_vec(&mut self.outs); - - if self.outs.is_empty() { return None } - return Some((input, Set::new_unchecked(&self.outs))) - }, - None => None - } - } -} -}} - -logical_operation!(struct Union, union); -logical_operation!(struct Intersection, intersection); -logical_operation!(struct Difference, difference); -logical_operation!(struct SymmetricDifference, symmetric_difference); diff --git a/src/database/database.rs b/src/database/database.rs index 37d95af1c..9b3d76f15 100644 --- a/src/database/database.rs +++ b/src/database/database.rs @@ -7,7 +7,7 @@ use rocksdb::rocksdb::{Writable, Snapshot}; use rocksdb::{DB, DBVector, MergeOperands}; use crossbeam::atomic::ArcCell; -use crate::database::blob::{self, Blob, PositiveBlob}; +use crate::database::index::{self, Index, Positive}; use crate::database::{DatabaseView, Update, Schema}; use crate::database::{DATA_INDEX, DATA_SCHEMA}; @@ -85,12 +85,9 @@ impl Database { Err(e) => return Err(e.to_string().into()), }; - let move_update = update.can_be_moved(); - let path = update.into_path_buf(); - let path = path.to_string_lossy(); - + let path = update.path().to_string_lossy(); let mut options = IngestExternalFileOptions::new(); - options.move_files(move_update); + // options.move_files(move_update); let cf_handle = db.cf_handle("default").expect("\"default\" column family not found"); db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?; @@ -124,42 +121,28 @@ impl Database { } } -fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - if key != DATA_INDEX { - panic!("The merge operator only supports \"data-index\" merging") - } +fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { + assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging"); - let capacity = { - let remaining = operands.size_hint().0; - let already_exist = usize::from(existing_value.is_some()); - remaining + already_exist - }; + let mut index: Option = None; - let mut op = blob::OpBuilder::with_capacity(capacity); - 
if let Some(bytes) = existing_value { + for bytes in existing.into_iter().chain(operands) { let bytes_len = bytes.len(); let bytes = Arc::new(bytes.to_vec()); - let blob = match PositiveBlob::from_shared_bytes(bytes, 0, bytes_len) { - Ok(blob) => blob, - Err(e) => panic!("BUG: could not deserialize data-index due to {}", e), + let operand = Index::from_shared_bytes(bytes, 0, bytes_len); + let operand = operand.expect("BUG: could not deserialize index"); + + let merged = match index { + Some(ref index) => index.merge(&operand).expect("BUG: could not merge index"), + None => operand, }; - op.push(Blob::Positive(blob)); + + index.replace(merged); } - for bytes in operands { - let bytes_len = bytes.len(); - let bytes = Arc::new(bytes.to_vec()); - let blob = match Blob::from_shared_bytes(bytes, 0, bytes_len) { - Ok(blob) => blob, - Err(e) => panic!("BUG: could not deserialize blob due to {}", e), - }; - op.push(blob); - } - - let blob = op.merge().expect("BUG: could not merge blobs"); - + let index = index.unwrap_or_default(); let mut bytes = Vec::new(); - blob.write_to_bytes(&mut bytes); + index.write_to_bytes(&mut bytes); bytes } @@ -172,7 +155,7 @@ mod tests { use tempfile::tempdir; use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - use crate::database::update::PositiveUpdateBuilder; + use crate::database::update::UpdateBuilder; use crate::tokenizer::DefaultBuilder; #[test] @@ -219,15 +202,14 @@ mod tests { let docid0; let docid1; let mut update = { - let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder); + let mut builder = UpdateBuilder::new(update_path, schema); - docid0 = builder.update(&doc0).unwrap(); - docid1 = builder.update(&doc1).unwrap(); + docid0 = builder.update_document(&doc0).unwrap(); + docid1 = builder.update_document(&doc1).unwrap(); builder.build()? 
}; - update.set_move(true); database.ingest_update_file(update)?; let view = database.view(); diff --git a/src/database/database_view.rs b/src/database/database_view.rs index b37d84042..b5d40400b 100644 --- a/src/database/database_view.rs +++ b/src/database/database_view.rs @@ -9,9 +9,9 @@ use serde::de::DeserializeOwned; use crate::database::{DocumentKey, DocumentKeyAttr}; use crate::database::{retrieve_data_schema, retrieve_data_index}; -use crate::database::blob::positive::PositiveBlob; use crate::database::deserializer::Deserializer; use crate::database::schema::Schema; +use crate::database::index::Index; use crate::rank::{QueryBuilder, FilterFunc}; use crate::DocumentId; @@ -19,7 +19,7 @@ pub struct DatabaseView where D: Deref { snapshot: Snapshot, - blob: PositiveBlob, + index: Index, schema: Schema, } @@ -28,16 +28,16 @@ where D: Deref { pub fn new(snapshot: Snapshot) -> Result, Box> { let schema = retrieve_data_schema(&snapshot)?; - let blob = retrieve_data_index(&snapshot)?; - Ok(DatabaseView { snapshot, blob, schema }) + let index = retrieve_data_index(&snapshot)?; + Ok(DatabaseView { snapshot, index, schema }) } pub fn schema(&self) -> &Schema { &self.schema } - pub fn blob(&self) -> &PositiveBlob { - &self.blob + pub fn index(&self) -> &Index { + &self.index } pub fn into_snapshot(self) -> Snapshot { diff --git a/src/database/index/mod.rs b/src/database/index/mod.rs new file mode 100644 index 000000000..0098c5fd2 --- /dev/null +++ b/src/database/index/mod.rs @@ -0,0 +1,81 @@ +mod negative; +mod positive; + +pub(crate) use self::negative::Negative; +pub(crate) use self::positive::{Positive, PositiveBuilder}; + +use std::sync::Arc; +use std::error::Error; +use std::io::{Cursor, BufRead}; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use fst::{IntoStreamer, Streamer}; +use sdset::duo::DifferenceByKey; +use sdset::{Set, SetOperation}; +use fst::raw::Fst; +use fst::Map; + +use crate::data::{DocIds, DocIndexes}; + +#[derive(Default)] +pub struct Index { + pub(crate) negative: Negative, + pub(crate) positive: Positive, +} + +impl Index { + pub fn from_bytes(bytes: Vec) -> Result> { + let len = bytes.len(); + Index::from_shared_bytes(Arc::new(bytes), 0, len) + } + + pub fn from_shared_bytes( + bytes: Arc>, + offset: usize, + len: usize, + ) -> Result> + { + let (negative, neg_offset) = Negative::from_shared_bytes(bytes.clone(), offset, len)?; + let (positive, _) = Positive::from_shared_bytes(bytes, offset + neg_offset, len)?; + Ok(Index { negative, positive }) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + self.negative.write_to_bytes(bytes); + self.positive.write_to_bytes(bytes); + } + + pub fn merge(&self, other: &Index) -> Result> { + if other.negative.is_empty() { + let negative = Negative::default(); + let positive = self.positive.union(&other.positive)?; + return Ok(Index { negative, positive }) + } + + let mut buffer = Vec::new(); + let mut builder = PositiveBuilder::memory(); + let mut stream = self.positive.into_stream(); + while let Some((key, indexes)) = stream.next() { + let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x); + + buffer.clear(); + op.extend_vec(&mut buffer); + + if !buffer.is_empty() { + let indexes = Set::new_unchecked(&buffer); + builder.insert(key, indexes)?; + } + } + + let positive = { + let (map, indexes) = builder.into_inner()?; + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Positive { map, indexes } + }; + + let negative = Negative::default(); + let 
positive = positive.union(&other.positive)?; + Ok(Index { negative, positive }) + } +} diff --git a/src/database/index/negative.rs b/src/database/index/negative.rs new file mode 100644 index 000000000..e9c30abfc --- /dev/null +++ b/src/database/index/negative.rs @@ -0,0 +1,53 @@ +use std::io::{Cursor, BufRead}; +use std::error::Error; +use std::mem::size_of; +use std::ops::Deref; +use std::sync::Arc; + +use sdset::Set; +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; + +use crate::data::DocIds; +use crate::DocumentId; + +#[derive(Default)] +pub struct Negative { + pub doc_ids: DocIds, +} + +impl Negative { + pub fn from_shared_bytes( + bytes: Arc>, + offset: usize, + len: usize, + ) -> Result<(Negative, usize), Box> + { + let mut cursor = Cursor::new(&bytes[..len]); + cursor.consume(offset); + + let len = cursor.read_u64::()? as usize; + let offset = cursor.position() as usize; + let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?; + + Ok((Negative { doc_ids }, offset + len)) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec) { + let slice = self.doc_ids.as_bytes(); + let len = slice.len() as u64; + let _ = bytes.write_u64::(len); + bytes.extend_from_slice(slice); + } + + pub fn is_empty(&self) -> bool { + self.doc_ids.doc_ids().is_empty() + } +} + +impl Deref for Negative { + type Target = Set; + + fn deref(&self) -> &Self::Target { + self.doc_ids.doc_ids() + } +} diff --git a/src/database/index/positive.rs b/src/database/index/positive.rs new file mode 100644 index 000000000..f72cb94de --- /dev/null +++ b/src/database/index/positive.rs @@ -0,0 +1,172 @@ +use std::io::{Write, BufRead, Cursor}; +use std::mem::size_of; +use std::error::Error; +use std::sync::Arc; + +use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; +use fst::{map, Map, Streamer, IntoStreamer}; +use sdset::{Set, SetOperation}; +use sdset::duo::Union; +use fst::raw::Fst; + +use crate::data::{DocIndexes, DocIndexesBuilder}; +use crate::DocIndex; + +#[derive(Default)] +pub struct Positive { + pub map: Map, + pub indexes: DocIndexes, +} + +impl Positive { + pub fn from_shared_bytes( + bytes: Arc>, + offset: usize, + len: usize, + ) -> Result<(Positive, usize), Box> + { + let mut cursor = Cursor::new(&bytes[..len]); + cursor.consume(offset); + + let map_len = cursor.read_u64::()? as usize; + let map_offset = cursor.position() as usize; + let fst = Fst::from_shared_bytes(bytes.clone(), map_offset, map_len)?; + let map = Map::from(fst); + + cursor.consume(map_len); + let indexes_len = cursor.read_u64::()? 
as usize; + let indexes_offset = cursor.position() as usize; + let indexes = DocIndexes::from_shared_bytes(bytes, indexes_offset, indexes_len)?; + + let positive = Positive { map, indexes }; + let len = indexes_offset + indexes_len; + + Ok((positive, len)) + } + + pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) { + // map + let slice = self.map.as_fst().as_bytes(); + let len = slice.len() as u64; + let _ = bytes.write_u64::<LittleEndian>(len); + bytes.extend_from_slice(slice); + + // indexes + self.indexes.write_to_bytes(bytes); + } + + pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> { + let mut builder = PositiveBuilder::memory(); + let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union(); + + let mut buffer = Vec::new(); + while let Some((key, ivalues)) = stream.next() { + buffer.clear(); + match ivalues { + [a, b] => { + let indexes = if a.index == 0 { &self.indexes } else { &other.indexes }; + let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?; + let a = Set::new_unchecked(indexes); + + let indexes = if b.index == 0 { &self.indexes } else { &other.indexes }; + let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?; + let b = Set::new_unchecked(indexes); + + let op = Union::new(a, b); + op.extend_vec(&mut buffer); + }, + [a] => { + let indexes = if a.index == 0 { &self.indexes } else { &other.indexes }; + let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?; + buffer.extend_from_slice(indexes) + }, + _ => continue, + } + + if !buffer.is_empty() { + let indexes = Set::new_unchecked(&buffer); + builder.insert(key, indexes)?; + } + } + + let (map, indexes) = builder.into_inner()?; + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Ok(Positive { map, indexes }) + } +} + +impl<'m, 'a> IntoStreamer<'a> for &'m Positive { + type Item = (&'a [u8], &'a Set<DocIndex>); + /// The type of the stream to be constructed. + type Into = Stream<'m>; + + /// Construct a stream from `Self`. + fn into_stream(self) -> Self::Into { + Stream { + map_stream: self.map.into_stream(), + indexes: &self.indexes, + } + } +} + +pub struct Stream<'m> { + map_stream: map::Stream<'m>, + indexes: &'m DocIndexes, +} + +impl<'m, 'a> Streamer<'a> for Stream<'m> { + type Item = (&'a [u8], &'a Set<DocIndex>); + + fn next(&'a mut self) -> Option<Self::Item> { + match self.map_stream.next() { + Some((input, index)) => { + let indexes = &self.indexes[index as usize]; + let indexes = Set::new_unchecked(indexes); + Some((input, indexes)) + }, + None => None, + } + } +} + +pub struct PositiveBuilder<W, X> { + map: fst::MapBuilder<W>, + indexes: DocIndexesBuilder<X>, + value: u64, +} + +impl PositiveBuilder<Vec<u8>, Vec<u8>> { + pub fn memory() -> Self { + PositiveBuilder { + map: fst::MapBuilder::memory(), + indexes: DocIndexesBuilder::memory(), + value: 0, + } + } +} + +impl<W: Write, X: Write> PositiveBuilder<W, X> { + /// If a key is inserted that is less than or equal to any previous key added, + /// then an error is returned. Similarly, if there was a problem writing + /// to the underlying writer, an error is returned. + // FIXME what if one write doesn't work but the other does?
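/* [editor's note: a minimal round-trip sketch, not part of this patch; `doc_index` is a hypothetical `DocIndex` value.]

    let mut builder = PositiveBuilder::memory();
    // keys must arrive in strictly increasing order, per the doc comment above
    builder.insert("apple", Set::new_unchecked(&[doc_index]))?;
    builder.insert("banana", Set::new_unchecked(&[doc_index]))?;

    let (map_bytes, indexes_bytes) = builder.into_inner()?;
    let positive = Positive {
        map: Map::from_bytes(map_bytes)?,
        indexes: DocIndexes::from_bytes(indexes_bytes)?,
    };
*/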
+ pub fn insert(&mut self, key: K, indexes: &Set) -> Result<(), Box> + where K: AsRef<[u8]>, + { + self.map.insert(key, self.value)?; + self.indexes.insert(indexes)?; + self.value += 1; + Ok(()) + } + + pub fn finish(self) -> Result<(), Box> { + self.into_inner().map(drop) + } + + pub fn into_inner(self) -> Result<(W, X), Box> { + let map = self.map.into_inner()?; + let indexes = self.indexes.into_inner()?; + Ok((map, indexes)) + } +} diff --git a/src/database/mod.rs b/src/database/mod.rs index 6a776d239..4ff34709b 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -6,15 +6,12 @@ use std::sync::Arc; use rocksdb::rocksdb::{DB, Snapshot}; -pub use self::update::{ - Update, PositiveUpdateBuilder, NewState, - SerializerError, NegativeUpdateBuilder -}; +pub use self::index::Index; +pub use self::update::{Update, UpdateBuilder}; pub use self::document_key::{DocumentKey, DocumentKeyAttr}; pub use self::database_view::{DatabaseView, DocumentIter}; pub use self::database::Database; pub use self::schema::Schema; -use self::blob::positive::PositiveBlob; const DATA_INDEX: &[u8] = b"data-index"; const DATA_SCHEMA: &[u8] = b"data-schema"; @@ -29,8 +26,8 @@ macro_rules! forward_to_unserializable_type { } } -pub mod blob; pub mod schema; +pub(crate) mod index; mod update; mod database; mod document_key; @@ -52,15 +49,15 @@ where D: Deref } } -fn retrieve_data_index(snapshot: &Snapshot) -> Result> +fn retrieve_data_index(snapshot: &Snapshot) -> Result> where D: Deref { match snapshot.get(DATA_INDEX)? { Some(vector) => { let bytes_len = vector.as_ref().len(); let bytes = Arc::new(vector.as_ref().to_vec()); - Ok(PositiveBlob::from_shared_bytes(bytes, 0, bytes_len)?) + Ok(Index::from_shared_bytes(bytes, 0, bytes_len)?) }, - None => Ok(PositiveBlob::default()), + None => Ok(Index::default()), } } diff --git a/src/database/schema.rs b/src/database/schema.rs index 8d0db7110..bffd0aaa5 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -1,4 +1,3 @@ -use crate::database::update::SerializerError; use std::collections::{HashMap, BTreeMap}; use crate::database::calculate_hash; use std::io::{Read, Write}; @@ -141,13 +140,10 @@ impl Schema { attributes } - pub fn document_id(&self, document: &T) -> Result + pub fn document_id(&self, document: &T) -> Result> where T: Serialize, { - let find_document_id = FindDocumentIdSerializer { - id_attribute_name: self.identifier_name(), - }; - document.serialize(find_document_id) + unimplemented!() } pub fn props(&self, attr: SchemaAttr) -> SchemaProps { @@ -188,192 +184,6 @@ impl fmt::Display for SchemaAttr { } } -struct FindDocumentIdSerializer<'a> { - id_attribute_name: &'a str, -} - -impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = FindDocumentIdStructSerializer<'a>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! 
{ - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - // Ok(MapSerializer { - // schema: self.schema, - // document_id: self.document_id, - // new_states: self.new_states, - // }) - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(FindDocumentIdStructSerializer { - id_attribute_name: self.id_attribute_name, - document_id: None, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -struct FindDocumentIdStructSerializer<'a> { - id_attribute_name: &'a str, - document_id: Option, -} - -impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { - type Ok = DocumentId; - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if self.id_attribute_name == key { - // TODO can it be possible to have multiple ids? 
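/* [editor's note: not part of this patch.] The identifier is not used verbatim: the field's value is serialized with bincode and those bytes are hashed into the `DocumentId`, so any serializable field can act as the id. A standalone sketch of the equivalent computation (hypothetical helper name):

    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    fn document_id_for<T: serde::Serialize>(value: &T) -> u64 {
        let bytes = bincode::serialize(value).unwrap();
        let mut hasher = DefaultHasher::new();
        bytes.hash(&mut hasher);
        hasher.finish()
    }
*/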
- let id = bincode::serialize(value).unwrap(); - let hash = calculate_hash(&id); - self.document_id = Some(DocumentId(hash)); - } - - Ok(()) - } - - fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/src/database/update/builder.rs b/src/database/update/builder.rs new file mode 100644 index 000000000..c3bdf59fc --- /dev/null +++ b/src/database/update/builder.rs @@ -0,0 +1,95 @@ +use std::collections::{BTreeMap, BTreeSet}; +use std::path::PathBuf; +use std::error::Error; + +use fst::map::{Map, MapBuilder}; +use rocksdb::rocksdb_options; +use serde::Serialize; +use sdset::Set; + +use crate::database::index::{Index, Positive, PositiveBuilder, Negative}; +use crate::database::{DATA_INDEX, Schema, DocumentKeyAttr}; +use crate::data::{DocIds, DocIndexes}; +use crate::{DocumentId, DocIndex}; +use super::Update; + +type Token = Vec; // TODO could be replaced by a SmallVec +type Value = Vec; + +pub struct UpdateBuilder { + sst_file: PathBuf, + schema: Schema, + removed_documents: BTreeSet, + words_indexes: BTreeMap>, + keys_values: BTreeMap, +} + +impl UpdateBuilder { + pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder { + UpdateBuilder { + sst_file: path, + schema: schema, + removed_documents: BTreeSet::new(), + words_indexes: BTreeMap::new(), + keys_values: BTreeMap::new(), + } + } + + pub fn update_document(&mut self, document: T) -> Result> + where T: Serialize, + { + unimplemented!() + } + + pub fn remove_document(&mut self, document: T) -> Result> + where T: Serialize, + { + unimplemented!() + } + + pub fn build(self) -> Result> { + let tree = { + let negative = { + let documents_ids = self.removed_documents.into_iter().collect(); + let doc_ids = DocIds::from_raw(documents_ids); + Negative { doc_ids } + }; + + let positive = { + let mut builder = PositiveBuilder::memory(); + + for (key, mut indexes) in self.words_indexes { + indexes.sort_unstable(); + let indexes = Set::new_unchecked(&indexes); + builder.insert(key, indexes); + } + + let (map, indexes) = builder.into_inner()?; + let map = Map::from_bytes(map)?; + let indexes = DocIndexes::from_bytes(indexes)?; + Positive { map, indexes } + }; + + Index { negative, positive } + }; + + let env_options = rocksdb_options::EnvOptions::new(); + let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); + let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); + file_writer.open(&self.sst_file.to_string_lossy())?; + + // write the data-index + let mut bytes = Vec::new(); + tree.write_to_bytes(&mut bytes); + file_writer.merge(DATA_INDEX, &bytes)?; + + // write all the documents attributes updates + for (key, value) in self.keys_values { + file_writer.put(key.as_ref(), &value)?; + } + + file_writer.finish()?; + + Ok(Update { sst_file: self.sst_file }) + } +} diff --git a/src/database/update/mod.rs b/src/database/update/mod.rs index 433624022..7bdda9949 100644 --- a/src/database/update/mod.rs +++ b/src/database/update/mod.rs @@ -1,35 +1,15 @@ -use std::path::PathBuf; -use std::error::Error; +use std::path::{Path, PathBuf}; -mod negative; -mod positive; +mod builder; -pub use self::positive::{PositiveUpdateBuilder, NewState, SerializerError}; -pub use self::negative::NegativeUpdateBuilder; +pub use self::builder::UpdateBuilder; pub struct Update { - path: PathBuf, - can_be_moved: bool, + sst_file: PathBuf, } impl Update { - pub fn open>(path: P) 
-> Result> { - Ok(Update { path: path.into(), can_be_moved: false }) - } - - pub fn open_and_move>(path: P) -> Result> { - Ok(Update { path: path.into(), can_be_moved: true }) - } - - pub fn set_move(&mut self, can_be_moved: bool) { - self.can_be_moved = can_be_moved - } - - pub fn can_be_moved(&self) -> bool { - self.can_be_moved - } - - pub fn into_path_buf(self) -> PathBuf { - self.path + pub fn path(&self) -> &Path { + &self.sst_file } } diff --git a/src/database/update/negative/mod.rs b/src/database/update/negative/mod.rs deleted file mode 100644 index bad19c918..000000000 --- a/src/database/update/negative/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -mod update; -mod unordered_builder; - -pub use self::update::NegativeUpdateBuilder; diff --git a/src/database/update/negative/unordered_builder.rs b/src/database/update/negative/unordered_builder.rs deleted file mode 100644 index 4278e6974..000000000 --- a/src/database/update/negative/unordered_builder.rs +++ /dev/null @@ -1,37 +0,0 @@ -use std::collections::BTreeSet; -use std::io; - -use byteorder::{NativeEndian, WriteBytesExt}; - -use crate::DocumentId; - -pub struct UnorderedNegativeBlobBuilder { - doc_ids: BTreeSet, // TODO: prefer a linked-list - wrt: W, -} - -impl UnorderedNegativeBlobBuilder> { - pub fn memory() -> Self { - UnorderedNegativeBlobBuilder::new(Vec::new()) - } -} - -impl UnorderedNegativeBlobBuilder { - pub fn new(wrt: W) -> Self { - Self { - doc_ids: BTreeSet::new(), - wrt: wrt, - } - } - - pub fn insert(&mut self, doc: DocumentId) -> bool { - self.doc_ids.insert(doc) - } - - pub fn into_inner(mut self) -> io::Result { - for id in self.doc_ids { - self.wrt.write_u64::(id.0)?; - } - Ok(self.wrt) - } -} diff --git a/src/database/update/negative/update.rs b/src/database/update/negative/update.rs deleted file mode 100644 index 4b0c83784..000000000 --- a/src/database/update/negative/update.rs +++ /dev/null @@ -1,61 +0,0 @@ -use std::path::PathBuf; -use std::error::Error; - -use ::rocksdb::rocksdb_options; - -use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder; -use crate::database::blob::{Blob, NegativeBlob}; -use crate::database::update::Update; -use crate::database::DocumentKey; -use crate::database::DATA_INDEX; -use crate::DocumentId; - -pub struct NegativeUpdateBuilder { - path: PathBuf, - doc_ids: UnorderedNegativeBlobBuilder>, -} - -impl NegativeUpdateBuilder { - pub fn new>(path: P) -> NegativeUpdateBuilder { - NegativeUpdateBuilder { - path: path.into(), - doc_ids: UnorderedNegativeBlobBuilder::memory(), - } - } - - pub fn remove(&mut self, id: DocumentId) -> bool { - self.doc_ids.insert(id) - } - - pub fn build(self) -> Result> { - let env_options = rocksdb_options::EnvOptions::new(); - let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); - let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); - file_writer.open(&self.path.to_string_lossy())?; - - let bytes = self.doc_ids.into_inner()?; - let negative_blob = NegativeBlob::from_bytes(bytes)?; - let blob = Blob::Negative(negative_blob); - - // write the data-index aka negative blob - let mut bytes = Vec::new(); - blob.write_to_bytes(&mut bytes); - file_writer.merge(DATA_INDEX, &bytes)?; - - // FIXME remove this ugly thing ! 
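/* [editor's note: not part of this patch.] The commented-out `let` below is rejected by the compiler because `Blob` has two variants, which makes the pattern refutable; hence the `match` with an `unreachable!` arm. A minimal illustration:

    enum Blob { Negative(u8), Positive(u8) }
    let blob = Blob::Negative(1);
    // let Blob::Negative(n) = blob;  // error[E0005]: refutable pattern in local binding
    let n = match blob {
        Blob::Negative(n) => n,
        Blob::Positive(_) => unreachable!(),
    };
*/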
- // let Blob::Negative(negative_blob) = blob; - let negative_blob = match blob { - Blob::Negative(blob) => blob, - Blob::Positive(_) => unreachable!(), - }; - - for &document_id in negative_blob.as_ref().as_slice() { - let start = DocumentKey::new(document_id); - let end = start.with_attribute_max(); - file_writer.delete_range(start.as_ref(), end.as_ref())?; - } - - file_writer.finish()?; - Update::open(self.path) - } -} diff --git a/src/database/update/positive/mod.rs b/src/database/update/positive/mod.rs deleted file mode 100644 index 414f88722..000000000 --- a/src/database/update/positive/mod.rs +++ /dev/null @@ -1,4 +0,0 @@ -mod update; -mod unordered_builder; - -pub use self::update::{PositiveUpdateBuilder, NewState, SerializerError}; diff --git a/src/database/update/positive/unordered_builder.rs b/src/database/update/positive/unordered_builder.rs deleted file mode 100644 index 6b9dc5abe..000000000 --- a/src/database/update/positive/unordered_builder.rs +++ /dev/null @@ -1,49 +0,0 @@ -#![allow(unused)] - -use std::collections::BTreeMap; -use std::error::Error; -use std::io::Write; - -use sdset::Set; - -use crate::database::blob::positive::PositiveBlobBuilder; -use crate::DocIndex; - -pub struct UnorderedPositiveBlobBuilder { - builder: PositiveBlobBuilder, - map: BTreeMap, Vec>, -} - -impl UnorderedPositiveBlobBuilder, Vec> { - pub fn memory() -> Self { - Self { - builder: PositiveBlobBuilder::memory(), - map: BTreeMap::new(), - } - } -} - -impl UnorderedPositiveBlobBuilder { - pub fn new(map_wtr: W, doc_wtr: X) -> Result> { - Ok(UnorderedPositiveBlobBuilder { - builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?, - map: BTreeMap::new(), - }) - } - - pub fn insert>>(&mut self, input: K, doc_index: DocIndex) { - self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index); - } - - pub fn finish(self) -> Result<(), Box> { - self.into_inner().map(drop) - } - - pub fn into_inner(mut self) -> Result<(W, X), Box> { - for (key, mut doc_indexes) in self.map { - doc_indexes.sort_unstable(); - self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?; - } - self.builder.into_inner() - } -} diff --git a/src/database/update/positive/update.rs b/src/database/update/positive/update.rs deleted file mode 100644 index 6316a2e7f..000000000 --- a/src/database/update/positive/update.rs +++ /dev/null @@ -1,505 +0,0 @@ -use std::collections::BTreeMap; -use std::path::PathBuf; -use std::error::Error; -use std::fmt; - -use ::rocksdb::rocksdb_options; -use serde::ser::{self, Serialize}; - -use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder; -use crate::database::blob::positive::PositiveBlob; -use crate::database::schema::{Schema, SchemaAttr}; -use crate::tokenizer::{TokenizerBuilder, Token}; -use crate::database::DocumentKeyAttr; -use crate::database::update::Update; -use crate::database::DATA_INDEX; -use crate::database::blob::Blob; -use crate::{DocumentId, DocIndex, Attribute, WordArea}; - -pub enum NewState { - Updated { value: Vec }, - Removed, -} - -pub struct PositiveUpdateBuilder { - path: PathBuf, - schema: Schema, - tokenizer_builder: B, - builder: UnorderedPositiveBlobBuilder, Vec>, - new_states: BTreeMap, -} - -impl PositiveUpdateBuilder { - pub fn new>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder { - PositiveUpdateBuilder { - path: path.into(), - schema: schema, - tokenizer_builder: tokenizer_builder, - builder: UnorderedPositiveBlobBuilder::memory(), - new_states: BTreeMap::new(), - } - } - - pub fn update(&mut self, 
document: &T) -> Result - where B: TokenizerBuilder - { - let document_id = self.schema.document_id(document)?; - - let serializer = Serializer { - schema: &self.schema, - tokenizer_builder: &self.tokenizer_builder, - document_id: document_id, - builder: &mut self.builder, - new_states: &mut self.new_states - }; - document.serialize(serializer)?; - - Ok(document_id) - } - - // TODO value must be a field that can be indexed - pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) { - let value = bincode::serialize(&value).unwrap(); - self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value }); - } - - pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) { - self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed); - } -} - -#[derive(Debug)] -pub enum SerializerError { - DocumentIdNotFound, - UnserializableType { name: &'static str }, - Custom(String), -} - -impl ser::Error for SerializerError { - fn custom(msg: T) -> Self { - SerializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for SerializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - SerializerError::DocumentIdNotFound => { - write!(f, "serialized document does not have an id according to the schema") - } - SerializerError::UnserializableType { name } => { - write!(f, "Only struct and map types are considered valid documents and - can be serialized, not {} types directly.", name) - }, - SerializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for SerializerError {} - -struct Serializer<'a, B> { - schema: &'a Schema, - tokenizer_builder: &'a B, - document_id: DocumentId, - builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, - new_states: &'a mut BTreeMap, -} - -impl<'a, B> ser::Serializer for Serializer<'a, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = StructSerializer<'a, B>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! 
{ - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - // Ok(MapSerializer { - // schema: self.schema, - // document_id: self.document_id, - // new_states: self.new_states, - // }) - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(StructSerializer { - schema: self.schema, - tokenizer_builder: self.tokenizer_builder, - document_id: self.document_id, - builder: self.builder, - new_states: self.new_states, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -struct StructSerializer<'a, B> { - schema: &'a Schema, - tokenizer_builder: &'a B, - document_id: DocumentId, - builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, - new_states: &'a mut BTreeMap, -} - -impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if 
props.is_stored() { - let value = bincode::serialize(value).unwrap(); - let key = DocumentKeyAttr::new(self.document_id, attr); - self.new_states.insert(key, NewState::Updated { value }); - } - if props.is_indexed() { - let serializer = IndexerSerializer { - builder: self.builder, - tokenizer_builder: self.tokenizer_builder, - document_id: self.document_id, - attribute: attr, - }; - value.serialize(serializer)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} - -struct IndexerSerializer<'a, B> { - tokenizer_builder: &'a B, - builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, - document_id: DocumentId, - attribute: SchemaAttr, -} - -impl<'a, B> ser::Serializer for IndexerSerializer<'a, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! { - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, v: &str) -> Result { - for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { - let doc_index = DocIndex { - document_id: self.document_id, - attribute: Attribute::new_faillible(self.attribute.0, word_index as u32), - word_area: WordArea::new_faillible(char_index as u32, word.len() as u16), - }; - - // insert the exact representation - let word_lower = word.to_lowercase(); - - // and the unidecoded lowercased version - let word_unidecoded = unidecode::unidecode(word).to_lowercase(); - if word_lower != word_unidecoded { - self.builder.insert(word_unidecoded, doc_index); - } - - self.builder.insert(word_lower, doc_index); - } - Ok(()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "seq" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" 
}) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -impl PositiveUpdateBuilder { - pub fn build(self) -> Result> { - let env_options = rocksdb_options::EnvOptions::new(); - let column_family_options = rocksdb_options::ColumnFamilyOptions::new(); - let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options); - file_writer.open(&self.path.to_string_lossy())?; - - let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?; - let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?; - let blob = Blob::Positive(positive_blob); - - // write the data-index aka positive blob - let mut bytes = Vec::new(); - blob.write_to_bytes(&mut bytes); - file_writer.merge(DATA_INDEX, &bytes)?; - - // write all the documents fields updates - for (key, state) in self.new_states { - match state { - NewState::Updated { value } => { - file_writer.put(key.as_ref(), &value)? - }, - NewState::Removed => file_writer.delete(key.as_ref())?, - } - } - - file_writer.finish()?; - Update::open(self.path) - } -} diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs index 39419024c..a5bbdf885 100644 --- a/src/rank/query_builder.rs +++ b/src/rank/query_builder.rs @@ -86,7 +86,7 @@ where D: Deref, let mut stream = { let mut op_builder = fst::map::OpBuilder::new(); for automaton in &automatons { - let stream = self.view.blob().as_map().search(automaton); + let stream = self.view.index().positive.map.search(automaton); op_builder.push(stream); } op_builder.union() @@ -100,7 +100,7 @@ where D: Deref, let distance = automaton.eval(input).to_u8(); let is_exact = distance == 0 && input.len() == automaton.query_len(); - let doc_indexes = self.view.blob().as_indexes(); + let doc_indexes = &self.view.index().positive.indexes; let doc_indexes = &doc_indexes[iv.value as usize]; for doc_index in doc_indexes { From c022fa3fcaf0be84e47ed0d6f7eac4e7ec4649cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Sat, 29 Dec 2018 14:59:02 +0100 Subject: [PATCH 3/5] chore: Move serde related structs to their module --- src/database/mod.rs | 24 +- src/database/schema.rs | 5 +- src/database/serde/find_id.rs | 243 +++++++++++++++++++ src/database/serde/indexer_serializer.rs | 188 +++++++++++++++ src/database/serde/key_to_string.rs | 146 ++++++++++++ src/database/serde/mod.rs | 57 +++++ src/database/serde/serializer.rs | 289 +++++++++++++++++++++++ 7 files changed, 930 insertions(+), 22 deletions(-) create mode 100644 src/database/serde/find_id.rs create mode 100644 src/database/serde/indexer_serializer.rs create mode 100644 src/database/serde/key_to_string.rs create mode 100644 src/database/serde/mod.rs create mode 100644 src/database/serde/serializer.rs diff 
--git a/src/database/mod.rs b/src/database/mod.rs index 4ff34709b..2830f934b 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -1,45 +1,29 @@ -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; use std::error::Error; use std::ops::Deref; use std::sync::Arc; use rocksdb::rocksdb::{DB, Snapshot}; -pub use self::index::Index; -pub use self::update::{Update, UpdateBuilder}; pub use self::document_key::{DocumentKey, DocumentKeyAttr}; pub use self::database_view::{DatabaseView, DocumentIter}; +pub use self::update::{Update, UpdateBuilder}; +pub use self::serde::SerializerError; pub use self::database::Database; pub use self::schema::Schema; +pub use self::index::Index; const DATA_INDEX: &[u8] = b"data-index"; const DATA_SCHEMA: &[u8] = b"data-schema"; -macro_rules! forward_to_unserializable_type { - ($($ty:ident => $se_method:ident,)*) => { - $( - fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { name: "$ty" }) - } - )* - } -} - pub mod schema; pub(crate) mod index; mod update; +mod serde; mod database; mod document_key; mod database_view; mod deserializer; -fn calculate_hash(t: &T) -> u64 { - let mut s = DefaultHasher::new(); - t.hash(&mut s); - s.finish() -} - fn retrieve_data_schema(snapshot: &Snapshot) -> Result> where D: Deref { diff --git a/src/database/schema.rs b/src/database/schema.rs index bffd0aaa5..5f622e003 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -1,5 +1,4 @@ use std::collections::{HashMap, BTreeMap}; -use crate::database::calculate_hash; use std::io::{Read, Write}; use std::error::Error; use std::{fmt, u16}; @@ -7,9 +6,11 @@ use std::ops::BitOr; use std::sync::Arc; use serde_derive::{Serialize, Deserialize}; -use serde::ser::{self, Serialize}; use linked_hash_map::LinkedHashMap; +use serde::Serialize; +use crate::database::serde::find_id::FindDocumentIdSerializer; +use crate::database::serde::SerializerError; use crate::DocumentId; pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false }; diff --git a/src/database/serde/find_id.rs b/src/database/serde/find_id.rs new file mode 100644 index 000000000..98e2e8036 --- /dev/null +++ b/src/database/serde/find_id.rs @@ -0,0 +1,243 @@ +use serde::Serialize; +use serde::ser; + +use crate::database::serde::key_to_string::KeyToStringSerializer; +use crate::database::serde::{SerializerError, calculate_hash}; +use crate::DocumentId; + +pub struct FindDocumentIdSerializer<'a> { + pub id_attribute_name: &'a str, +} + +impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { + type Ok = DocumentId; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = FindDocumentIdMapSerializer<'a>; + type SerializeStruct = FindDocumentIdStructSerializer<'a>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! 
{ + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, _v: &str) -> Result { + Err(SerializerError::UnserializableType { name: "str" }) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Ok(FindDocumentIdMapSerializer { + id_attribute_name: self.id_attribute_name, + document_id: None, + current_key_name: None, + }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Ok(FindDocumentIdStructSerializer { + id_attribute_name: self.id_attribute_name, + document_id: None, + }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} + +pub struct FindDocumentIdMapSerializer<'a> { + id_attribute_name: &'a str, + document_id: Option, + current_key_name: Option, +} + +impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { + type Ok = DocumentId; + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = key.serialize(KeyToStringSerializer)?; + self.current_key_name = Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V + ) -> 
Result<(), Self::Error> + where K: Serialize, V: Serialize, + { + let key = key.serialize(KeyToStringSerializer)?; + + if self.id_attribute_name == key { + // TODO is it possible to have multiple ids? + let id = bincode::serialize(value).unwrap(); + let hash = calculate_hash(&id); + self.document_id = Some(DocumentId(hash)); + } + + Ok(()) + } + + fn end(self) -> Result { + match self.document_id { + Some(document_id) => Ok(document_id), + None => Err(SerializerError::DocumentIdNotFound) + } + } +} + +pub struct FindDocumentIdStructSerializer<'a> { + id_attribute_name: &'a str, + document_id: Option, +} + +impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { + type Ok = DocumentId; + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T + ) -> Result<(), Self::Error> + where T: Serialize, + { + if self.id_attribute_name == key { + // TODO can it be possible to have multiple ids? + let id = bincode::serialize(value).unwrap(); + let hash = calculate_hash(&id); + self.document_id = Some(DocumentId(hash)); + } + + Ok(()) + } + + fn end(self) -> Result { + match self.document_id { + Some(document_id) => Ok(document_id), + None => Err(SerializerError::DocumentIdNotFound) + } + } +} diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs new file mode 100644 index 000000000..bfb0118ed --- /dev/null +++ b/src/database/serde/indexer_serializer.rs @@ -0,0 +1,188 @@ +use crate::database::update::UnorderedPositiveBlobBuilder; +use crate::database::schema::SchemaAttr; +use crate::database::serde::SerializerError; +use crate::tokenizer::TokenizerBuilder; +use crate::tokenizer::Token; +use crate::{DocumentId, DocIndex, Attribute, WordArea}; + +use serde::Serialize; +use serde::ser; + +pub struct IndexerSerializer<'a, B> { + pub tokenizer_builder: &'a B, + pub builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, + pub document_id: DocumentId, + pub attribute: SchemaAttr, +} + +impl<'a, B> ser::Serializer for IndexerSerializer<'a, B> +where B: TokenizerBuilder +{ + type Ok = (); + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! 
{ + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, v: &str) -> Result { + for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) { + // FIXME must u32::try_from instead + let attribute = match Attribute::new(self.attribute.0, word_index as u32) { + Ok(attribute) => attribute, + Err(_) => return Ok(()), + }; + + // FIXME must u16/u32::try_from instead + let word_area = match WordArea::new(char_index as u32, word.len() as u16) { + Ok(word_area) => word_area, + Err(_) => return Ok(()), + }; + + let doc_index = DocIndex { + document_id: self.document_id, + attribute, + word_area + }; + + // insert the exact representation + let word_lower = word.to_lowercase(); + + // and the unidecoded lowercased version + let word_unidecoded = unidecode::unidecode(word).to_lowercase(); + if word_lower != word_unidecoded { + self.builder.insert(word_unidecoded, doc_index); + } + + self.builder.insert(word_lower, doc_index); + } + Ok(()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "seq" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} diff --git 
a/src/database/serde/key_to_string.rs b/src/database/serde/key_to_string.rs new file mode 100644 index 000000000..2fe0c5a39 --- /dev/null +++ b/src/database/serde/key_to_string.rs @@ -0,0 +1,146 @@ +use serde::Serialize; +use serde::ser; + +use crate::database::serde::SerializerError; + +pub struct KeyToStringSerializer; + +impl ser::Serializer for KeyToStringSerializer { + type Ok = String; + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = ser::Impossible; + type SerializeStruct = ser::Impossible; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! { + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, value: &str) -> Result { + Ok(value.to_string()) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "map" }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct" }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} diff --git a/src/database/serde/mod.rs b/src/database/serde/mod.rs new file mode 100644 index 
000000000..248c5cf5f --- /dev/null +++ b/src/database/serde/mod.rs @@ -0,0 +1,57 @@ +use std::collections::hash_map::DefaultHasher; +use std::hash::{Hash, Hasher}; +use std::error::Error; +use std::fmt; + +use serde::ser; + +macro_rules! forward_to_unserializable_type { + ($($ty:ident => $se_method:ident,)*) => { + $( + fn $se_method(self, _v: $ty) -> Result { + Err(SerializerError::UnserializableType { name: "$ty" }) + } + )* + } +} + +pub mod find_id; +pub mod key_to_string; +pub mod serializer; +pub mod indexer_serializer; + +pub fn calculate_hash(t: &T) -> u64 { + let mut s = DefaultHasher::new(); + t.hash(&mut s); + s.finish() +} + +#[derive(Debug)] +pub enum SerializerError { + DocumentIdNotFound, + UnserializableType { name: &'static str }, + Custom(String), +} + +impl ser::Error for SerializerError { + fn custom(msg: T) -> Self { + SerializerError::Custom(msg.to_string()) + } +} + +impl fmt::Display for SerializerError { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + match self { + SerializerError::DocumentIdNotFound => { + write!(f, "serialized document does not have an id according to the schema") + } + SerializerError::UnserializableType { name } => { + write!(f, "Only struct and map types are considered valid documents and + can be serialized, not {} types directly.", name) + }, + SerializerError::Custom(s) => f.write_str(&s), + } + } +} + +impl Error for SerializerError {} diff --git a/src/database/serde/serializer.rs b/src/database/serde/serializer.rs new file mode 100644 index 000000000..d2faed2db --- /dev/null +++ b/src/database/serde/serializer.rs @@ -0,0 +1,289 @@ +use std::collections::BTreeMap; + +use serde::Serialize; +use serde::ser; + +use crate::database::serde::indexer_serializer::IndexerSerializer; +use crate::database::serde::key_to_string::KeyToStringSerializer; +use crate::database::update::UnorderedPositiveBlobBuilder; +use crate::database::document_key::DocumentKeyAttr; +use crate::database::update::NewState; +use crate::database::Schema; +use crate::database::serde::SerializerError; +use crate::tokenizer::TokenizerBuilder; +use crate::DocumentId; + +pub struct Serializer<'a, B> { + pub schema: &'a Schema, + pub tokenizer_builder: &'a B, + pub document_id: DocumentId, + pub builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, + pub new_states: &'a mut BTreeMap, +} + +impl<'a, B> ser::Serializer for Serializer<'a, B> +where B: TokenizerBuilder +{ + type Ok = (); + type Error = SerializerError; + type SerializeSeq = ser::Impossible; + type SerializeTuple = ser::Impossible; + type SerializeTupleStruct = ser::Impossible; + type SerializeTupleVariant = ser::Impossible; + type SerializeMap = MapSerializer<'a, B>; + type SerializeStruct = StructSerializer<'a, B>; + type SerializeStructVariant = ser::Impossible; + + forward_to_unserializable_type! 
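/* [editor's note: not part of this patch.] One caveat with `forward_to_unserializable_type!` as defined in serde/mod.rs: macro metavariables do not expand inside string literals, so `name: "$ty"` always yields the literal text "$ty" rather than the forwarded type's name. A sketch of a fix using `stringify!`:

    macro_rules! forward_to_unserializable_type {
        ($($ty:ident => $se_method:ident,)*) => {
            $(
                fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                    Err(SerializerError::UnserializableType { name: stringify!($ty) })
                }
            )*
        }
    }
*/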
{ + bool => serialize_bool, + char => serialize_char, + + i8 => serialize_i8, + i16 => serialize_i16, + i32 => serialize_i32, + i64 => serialize_i64, + + u8 => serialize_u8, + u16 => serialize_u16, + u32 => serialize_u32, + u64 => serialize_u64, + + f32 => serialize_f32, + f64 => serialize_f64, + } + + fn serialize_str(self, _v: &str) -> Result { + Err(SerializerError::UnserializableType { name: "str" }) + } + + fn serialize_bytes(self, _v: &[u8]) -> Result { + Err(SerializerError::UnserializableType { name: "&[u8]" }) + } + + fn serialize_none(self) -> Result { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_some(self, _value: &T) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "Option" }) + } + + fn serialize_unit(self) -> Result { + Err(SerializerError::UnserializableType { name: "()" }) + } + + fn serialize_unit_struct(self, _name: &'static str) -> Result { + Err(SerializerError::UnserializableType { name: "unit struct" }) + } + + fn serialize_unit_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str + ) -> Result + { + Err(SerializerError::UnserializableType { name: "unit variant" }) + } + + fn serialize_newtype_struct( + self, + _name: &'static str, + value: &T + ) -> Result + where T: Serialize, + { + value.serialize(self) + } + + fn serialize_newtype_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _value: &T + ) -> Result + where T: Serialize, + { + Err(SerializerError::UnserializableType { name: "newtype variant" }) + } + + fn serialize_seq(self, _len: Option) -> Result { + Err(SerializerError::UnserializableType { name: "sequence" }) + } + + fn serialize_tuple(self, _len: usize) -> Result { + Err(SerializerError::UnserializableType { name: "tuple" }) + } + + fn serialize_tuple_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple struct" }) + } + + fn serialize_tuple_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "tuple variant" }) + } + + fn serialize_map(self, _len: Option) -> Result { + Ok(MapSerializer { + schema: self.schema, + tokenizer_builder: self.tokenizer_builder, + document_id: self.document_id, + current_key_name: None, + builder: self.builder, + new_states: self.new_states, + }) + } + + fn serialize_struct( + self, + _name: &'static str, + _len: usize + ) -> Result + { + Ok(StructSerializer { + schema: self.schema, + tokenizer_builder: self.tokenizer_builder, + document_id: self.document_id, + builder: self.builder, + new_states: self.new_states, + }) + } + + fn serialize_struct_variant( + self, + _name: &'static str, + _variant_index: u32, + _variant: &'static str, + _len: usize + ) -> Result + { + Err(SerializerError::UnserializableType { name: "struct variant" }) + } +} + +pub struct MapSerializer<'a, B> { + pub schema: &'a Schema, + pub tokenizer_builder: &'a B, + pub document_id: DocumentId, + pub current_key_name: Option, + pub builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, + pub new_states: &'a mut BTreeMap, +} + +impl<'a, B> ser::SerializeMap for MapSerializer<'a, B> +where B: TokenizerBuilder +{ + type Ok = (); + type Error = SerializerError; + + fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = key.serialize(KeyToStringSerializer)?; + self.current_key_name 
= Some(key); + Ok(()) + } + + fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> + where T: Serialize, + { + let key = self.current_key_name.take().unwrap(); + self.serialize_entry(&key, value) + } + + fn serialize_entry( + &mut self, + key: &K, + value: &V + ) -> Result<(), Self::Error> + where K: Serialize, V: Serialize, + { + let key = key.serialize(KeyToStringSerializer)?; + + if let Some(attr) = self.schema.attribute(key) { + let props = self.schema.props(attr); + if props.is_stored() { + let value = bincode::serialize(value).unwrap(); + let key = DocumentKeyAttr::new(self.document_id, attr); + self.new_states.insert(key, NewState::Updated { value }); + } + if props.is_indexed() { + let serializer = IndexerSerializer { + builder: self.builder, + tokenizer_builder: self.tokenizer_builder, + document_id: self.document_id, + attribute: attr, + }; + value.serialize(serializer)?; + } + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(()) + } +} + +pub struct StructSerializer<'a, B> { + pub schema: &'a Schema, + pub tokenizer_builder: &'a B, + pub document_id: DocumentId, + pub builder: &'a mut UnorderedPositiveBlobBuilder, Vec>, + pub new_states: &'a mut BTreeMap, +} + +impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B> +where B: TokenizerBuilder +{ + type Ok = (); + type Error = SerializerError; + + fn serialize_field( + &mut self, + key: &'static str, + value: &T + ) -> Result<(), Self::Error> + where T: Serialize, + { + if let Some(attr) = self.schema.attribute(key) { + let props = self.schema.props(attr); + if props.is_stored() { + let value = bincode::serialize(value).unwrap(); + let key = DocumentKeyAttr::new(self.document_id, attr); + self.new_states.insert(key, NewState::Updated { value }); + } + if props.is_indexed() { + let serializer = IndexerSerializer { + builder: self.builder, + tokenizer_builder: self.tokenizer_builder, + document_id: self.document_id, + attribute: attr, + }; + value.serialize(serializer)?; + } + } + + Ok(()) + } + + fn end(self) -> Result { + Ok(()) + } +} From 64d53ee1bd82ef6de1c06082749948ff40cb63ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20Renault?= Date: Mon, 31 Dec 2018 18:33:59 +0100 Subject: [PATCH 4/5] chore: Rework the data module structures being able to be constructed from SharedData --- examples/create-database.rs | 4 +- src/data/doc_ids.rs | 63 +++++++-------- src/data/doc_indexes.rs | 94 +++++++++++------------ src/data/mod.rs | 38 +++++----- src/database/database.rs | 12 +-- src/database/index/mod.rs | 17 +++-- src/database/index/negative.rs | 36 ++++----- src/database/index/positive.rs | 54 ++++++------- src/database/schema.rs | 6 +- src/database/serde/indexer_serializer.rs | 8 +- src/database/serde/serializer.rs | 26 +++---- src/database/update/builder.rs | 97 ++++++++---------------- src/database/update/mod.rs | 2 + src/database/update/raw_builder.rs | 93 +++++++++++++++++++++++ src/rank/query_builder.rs | 4 +- 15 files changed, 292 insertions(+), 262 deletions(-) create mode 100644 src/database/update/raw_builder.rs diff --git a/examples/create-database.rs b/examples/create-database.rs index 5c6e81d1d..07ffeb931 100644 --- a/examples/create-database.rs +++ b/examples/create-database.rs @@ -59,10 +59,10 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result Self { - DocIds { data: SharedData::empty() } + pub fn new(ids: &Set) -> DocIds { + let bytes = unsafe { into_u8_slice(ids.as_slice()) }; + let data = SharedData::from_bytes(bytes.to_vec()); + DocIds(data) } - pub fn 
from_bytes(vec: Vec<u8>) -> io::Result<Self> {
-        let len = vec.len();
-        DocIds::from_shared_bytes(Arc::new(vec), 0, len)
-    }
+    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let offset = cursor.position() as usize;
+        let doc_ids = cursor.get_ref().range(offset, len);
+        cursor.consume(len);
 
-    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
-        let data = SharedData { bytes, offset, len };
-        DocIds::from_data(data)
-    }
-
-    pub fn as_bytes(&self) -> &[u8] {
-        &self.data
-    }
-
-    fn from_data(data: SharedData) -> io::Result<Self> {
-        let len = data.as_ref().read_u64::<LittleEndian>()?;
-        let data = data.range(mem::size_of::<u64>(), len as usize);
-        Ok(DocIds { data })
-    }
-
-    pub fn from_raw(vec: Vec<DocumentId>) -> Self {
-        DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
+        Ok(DocIds(doc_ids))
     }
 
     pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let len = self.data.len() as u64;
+        let len = self.0.len() as u64;
         bytes.write_u64::<LittleEndian>(len).unwrap();
-        bytes.extend_from_slice(&self.data);
+        bytes.extend_from_slice(&self.0);
     }
 
-    pub fn contains(&self, doc: DocumentId) -> bool {
-        // FIXME prefer using the sdset::exponential_search function
-        self.doc_ids().binary_search(&doc).is_ok()
+    pub fn is_empty(&self) -> bool {
+        self.0.is_empty()
     }
 
-    pub fn doc_ids(&self) -> &Set<DocumentId> {
-        let slice = &self.data;
+    pub fn as_bytes(&self) -> &[u8] {
+        &self.0
+    }
+}
+
+impl AsRef<Set<DocumentId>> for DocIds {
+    fn as_ref(&self) -> &Set<DocumentId> {
+        let slice = &self.0;
         let ptr = slice.as_ptr() as *const DocumentId;
-        let len = slice.len() / mem::size_of::<DocumentId>();
+        let len = slice.len() / size_of::<DocumentId>();
         let slice = unsafe { from_raw_parts(ptr, len) };
         Set::new_unchecked(slice)
     }
diff --git a/src/data/doc_indexes.rs b/src/data/doc_indexes.rs
index 21627cb0d..b760765bf 100644
--- a/src/data/doc_indexes.rs
+++ b/src/data/doc_indexes.rs
@@ -1,5 +1,5 @@
+use std::io::{self, Write, Cursor, BufRead};
 use std::slice::from_raw_parts;
-use std::io::{self, Write};
 use std::mem::size_of;
 use std::ops::Index;
 use std::sync::Arc;
@@ -9,6 +9,7 @@ use sdset::Set;
 
 use crate::DocIndex;
 use crate::data::SharedData;
+use super::into_u8_slice;
 
 #[derive(Debug)]
 #[repr(C)]
@@ -24,40 +25,36 @@ pub struct DocIndexes {
 }
 
 impl DocIndexes {
-    pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
-        let len = vec.len();
-        DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
+    pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
+        let bytes = Arc::new(bytes);
+        let len = bytes.len();
+        let data = SharedData::new(bytes, 0, len);
+        let mut cursor = Cursor::new(data);
+        DocIndexes::from_cursor(&mut cursor)
     }
 
-    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
-        let data = SharedData { bytes, offset, len };
-        DocIndexes::from_data(data)
-    }
+    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let offset = cursor.position() as usize;
+        let ranges = cursor.get_ref().range(offset, len);
+        cursor.consume(len);
 
-    fn from_data(data: SharedData) -> io::Result<Self> {
-        let ranges_len_offset = data.len() - size_of::<u64>();
-        let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
-        let ranges_len = ranges_len as usize;
-
-        let ranges_offset = ranges_len_offset - ranges_len;
-        let ranges = data.range(ranges_offset, ranges_len);
-
-        let indexes = data.range(0, ranges_offset);
+        let len = cursor.read_u64::<LittleEndian>()? 
as usize;
+        let offset = cursor.position() as usize;
+        let indexes = cursor.get_ref().range(offset, len);
+        cursor.consume(len);
 
         Ok(DocIndexes { ranges, indexes })
     }
 
     pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
         let ranges_len = self.ranges.len() as u64;
-        let indexes_len = self.indexes.len() as u64;
-        let u64_size = size_of::<u64>() as u64;
-        let len = indexes_len + ranges_len + u64_size;
-
-        let _ = bytes.write_u64::<LittleEndian>(len);
-
-        bytes.extend_from_slice(&self.indexes);
-        bytes.extend_from_slice(&self.ranges);
         let _ = bytes.write_u64::<LittleEndian>(ranges_len);
+        bytes.extend_from_slice(&self.ranges);
+
+        let indexes_len = self.indexes.len() as u64;
+        let _ = bytes.write_u64::<LittleEndian>(indexes_len);
+        bytes.extend_from_slice(&self.indexes);
     }
 
     pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
@@ -97,12 +94,17 @@ impl Index<usize> for DocIndexes {
 
 pub struct DocIndexesBuilder<W> {
     ranges: Vec<Range>,
+    indexes: Vec<DocIndex>,
     wtr: W,
 }
 
 impl DocIndexesBuilder<Vec<u8>> {
     pub fn memory() -> Self {
-        DocIndexesBuilder::new(Vec::new())
+        DocIndexesBuilder {
+            ranges: Vec::new(),
+            indexes: Vec::new(),
+            wtr: Vec::new(),
+        }
     }
 }
 
@@ -110,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
     pub fn new(wtr: W) -> Self {
         DocIndexesBuilder {
             ranges: Vec::new(),
+            indexes: Vec::new(),
             wtr: wtr,
         }
     }
 
-    pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
+    pub fn insert(&mut self, indexes: &Set<DocIndex>) {
         let len = indexes.len() as u64;
         let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
         let range = Range { start, end: start + len };
         self.ranges.push(range);
 
-        // write the values
-        let indexes = unsafe { into_u8_slice(indexes) };
-        self.wtr.write_all(indexes)
+        self.indexes.extend_from_slice(indexes);
     }
 
     pub fn finish(self) -> io::Result<()> {
@@ -130,24 +131,20 @@ impl<W: Write> DocIndexesBuilder<W> {
     }
 
     pub fn into_inner(mut self) -> io::Result<W> {
-        // write the ranges
-        let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
-        self.wtr.write_all(ranges)?;
-
-        // write the length of the ranges
+        let ranges = unsafe { into_u8_slice(&self.ranges) };
         let len = ranges.len() as u64;
         self.wtr.write_u64::<LittleEndian>(len)?;
+        self.wtr.write_all(ranges)?;
+
+        let indexes = unsafe { into_u8_slice(&self.indexes) };
+        let len = indexes.len() as u64;
+        self.wtr.write_u64::<LittleEndian>(len)?;
+        self.wtr.write_all(indexes)?;
 
         Ok(self.wtr)
     }
 }
 
-unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
-    let ptr = slice.as_ptr() as *const u8;
-    let len = slice.len() * size_of::<T>();
-    from_raw_parts(ptr, len)
-}
-
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -177,9 +174,9 @@ mod tests {
 
         let mut builder = DocIndexesBuilder::memory();
 
-        builder.insert(Set::new(&[a])?)?;
-        builder.insert(Set::new(&[a, b, c])?)?;
-        builder.insert(Set::new(&[a, c])?)?;
+        builder.insert(Set::new(&[a])?);
+        builder.insert(Set::new(&[a, b, c])?);
+        builder.insert(Set::new(&[a, c])?);
 
         let bytes = builder.into_inner()?;
         let docs = DocIndexes::from_bytes(bytes)?;
@@ -212,18 +209,17 @@ mod tests {
 
         let mut builder = DocIndexesBuilder::memory();
 
-        builder.insert(Set::new(&[a])?)?;
-        builder.insert(Set::new(&[a, b, c])?)?;
-        builder.insert(Set::new(&[a, c])?)?;
+        builder.insert(Set::new(&[a])?);
+        builder.insert(Set::new(&[a, b, c])?);
+        builder.insert(Set::new(&[a, c])?);
 
         let builder_bytes = builder.into_inner()?;
         let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
 
         let mut bytes = Vec::new();
         docs.write_to_bytes(&mut bytes);
 
-        let len = size_of::<u64>();
-        assert_eq!(builder_bytes, &bytes[len..]);
+        assert_eq!(builder_bytes, bytes);
 
         Ok(())
     }
diff --git a/src/data/mod.rs b/src/data/mod.rs
index 69888dfcf..0e0b0e2c4 100644
--- a/src/data/mod.rs
+++ 
b/src/data/mod.rs
@@ -1,26 +1,30 @@
 mod doc_ids;
 mod doc_indexes;
 
+use std::slice::from_raw_parts;
+use std::mem::size_of;
 use std::ops::Deref;
 use std::sync::Arc;
 
 pub use self::doc_ids::DocIds;
 pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
 
-#[derive(Clone)]
-struct SharedData {
-    bytes: Arc<Vec<u8>>,
-    offset: usize,
-    len: usize,
+#[derive(Default, Clone)]
+pub struct SharedData {
+    pub bytes: Arc<Vec<u8>>,
+    pub offset: usize,
+    pub len: usize,
 }
 
 impl SharedData {
-    pub fn empty() -> SharedData {
-        SharedData {
-            bytes: Arc::default(),
-            offset: 0,
-            len: 0,
-        }
+    pub fn from_bytes(vec: Vec<u8>) -> SharedData {
+        let len = vec.len();
+        let bytes = Arc::new(vec);
+        SharedData::new(bytes, 0, len)
+    }
+
+    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+        SharedData { bytes, offset, len }
     }
 
     pub fn range(&self, offset: usize, len: usize) -> SharedData {
@@ -33,12 +37,6 @@ impl SharedData {
     }
 }
 
-impl Default for SharedData {
-    fn default() -> SharedData {
-        SharedData::empty()
-    }
-}
-
 impl Deref for SharedData {
     type Target = [u8];
 
@@ -52,3 +50,9 @@ impl AsRef<[u8]> for SharedData {
         &self.bytes[self.offset..self.offset + self.len]
     }
 }
+
+unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
+    let ptr = slice.as_ptr() as *const u8;
+    let len = slice.len() * size_of::<T>();
+    from_raw_parts(ptr, len)
+}
diff --git a/src/database/database.rs b/src/database/database.rs
index 9b3d76f15..225059300 100644
--- a/src/database/database.rs
+++ b/src/database/database.rs
@@ -7,7 +7,7 @@ use rocksdb::rocksdb::{Writable, Snapshot};
 use rocksdb::{DB, DBVector, MergeOperands};
 use crossbeam::atomic::ArcCell;
 
-use crate::database::index::{self, Index, Positive};
+use crate::database::index::Index;
 use crate::database::{DatabaseView, Update, Schema};
 use crate::database::{DATA_INDEX, DATA_SCHEMA};
 
@@ -86,7 +86,7 @@ impl Database {
         };
 
         let path = update.path().to_string_lossy();
-        let mut options = IngestExternalFileOptions::new();
+        let options = IngestExternalFileOptions::new();
         // options.move_files(move_update);
 
         let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
@@ -182,7 +182,6 @@ mod tests {
         };
 
         let database = Database::create(&rocksdb_path, schema.clone())?;
-        let tokenizer_builder = DefaultBuilder::new();
 
         let update_path = dir.path().join("update.sst");
 
@@ -201,11 +200,12 @@ mod tests {
 
         let docid0;
         let docid1;
-        let mut update = {
+        let update = {
+            let tokenizer_builder = DefaultBuilder::new();
             let mut builder = UpdateBuilder::new(update_path, schema);
 
-            docid0 = builder.update_document(&doc0).unwrap();
-            docid1 = builder.update_document(&doc1).unwrap();
+            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
 
            builder.build()?
        };
diff --git a/src/database/index/mod.rs b/src/database/index/mod.rs
index 0098c5fd2..f9964f1f5 100644
--- a/src/database/index/mod.rs
+++ b/src/database/index/mod.rs
@@ -4,18 +4,16 @@ mod positive;
 pub(crate) use self::negative::Negative;
 pub(crate) use self::positive::{Positive, PositiveBuilder};
 
-use std::sync::Arc;
 use std::error::Error;
-use std::io::{Cursor, BufRead};
+use std::io::Cursor;
+use std::sync::Arc;
 
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use fst::{IntoStreamer, Streamer};
 use sdset::duo::DifferenceByKey;
 use sdset::{Set, SetOperation};
-use fst::raw::Fst;
 use fst::Map;
 
-use crate::data::{DocIds, DocIndexes};
+use crate::data::{SharedData, DocIndexes};
 
 #[derive(Default)]
 pub struct Index {
@@ -35,8 +33,11 @@ impl Index {
         len: usize,
     ) -> Result<Index, Box<Error>>
     {
-        let (negative, neg_offset) = Negative::from_shared_bytes(bytes.clone(), offset, len)?;
-        let (positive, _) = Positive::from_shared_bytes(bytes, offset + neg_offset, len)?;
+        let data = SharedData::new(bytes, offset, len);
+        let mut cursor = Cursor::new(data);
+
+        let negative = Negative::from_cursor(&mut cursor)?;
+        let positive = Positive::from_cursor(&mut cursor)?;
 
         Ok(Index { negative, positive })
     }
@@ -71,7 +72,7 @@ impl Index {
             let (map, indexes) = builder.into_inner()?;
             let map = Map::from_bytes(map)?;
             let indexes = DocIndexes::from_bytes(indexes)?;
-            Positive { map, indexes }
+            Positive::new(map, indexes)
         };
 
         let negative = Negative::default();
diff --git a/src/database/index/negative.rs b/src/database/index/negative.rs
index e9c30abfc..822c99d20 100644
--- a/src/database/index/negative.rs
+++ b/src/database/index/negative.rs
@@ -1,46 +1,36 @@
-use std::io::{Cursor, BufRead};
 use std::error::Error;
-use std::mem::size_of;
+use std::io::Cursor;
 use std::ops::Deref;
-use std::sync::Arc;
 
 use sdset::Set;
-use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
+use byteorder::{LittleEndian, WriteBytesExt};
 
+use crate::data::SharedData;
 use crate::data::DocIds;
 use crate::DocumentId;
 
 #[derive(Default)]
-pub struct Negative {
-    pub doc_ids: DocIds,
-}
+pub struct Negative(DocIds);
 
 impl Negative {
-    pub fn from_shared_bytes(
-        bytes: Arc<Vec<u8>>,
-        offset: usize,
-        len: usize,
-    ) -> Result<(Negative, usize), Box<Error>>
-    {
-        let mut cursor = Cursor::new(&bytes[..len]);
-        cursor.consume(offset);
+    pub fn new(doc_ids: DocIds) -> Negative {
+        Negative(doc_ids)
+    }
 
-        let len = cursor.read_u64::<LittleEndian>()? 
as usize;
-        let offset = cursor.position() as usize;
-        let doc_ids = DocIds::from_shared_bytes(bytes, offset, len)?;
-
-        Ok((Negative { doc_ids }, offset + len))
+    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
+        let doc_ids = DocIds::from_cursor(cursor)?;
+        Ok(Negative(doc_ids))
     }
 
     pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        let slice = self.doc_ids.as_bytes();
+        let slice = self.0.as_bytes();
         let len = slice.len() as u64;
         let _ = bytes.write_u64::<LittleEndian>(len);
         bytes.extend_from_slice(slice);
     }
 
     pub fn is_empty(&self) -> bool {
-        self.doc_ids.doc_ids().is_empty()
+        self.0.is_empty()
     }
 }
 
@@ -48,6 +38,6 @@ impl Deref for Negative {
     type Target = Set<DocumentId>;
 
     fn deref(&self) -> &Self::Target {
-        self.doc_ids.doc_ids()
+        self.0.as_ref()
     }
 }
diff --git a/src/database/index/positive.rs b/src/database/index/positive.rs
index f72cb94de..d6c3bf3d5 100644
--- a/src/database/index/positive.rs
+++ b/src/database/index/positive.rs
@@ -1,7 +1,5 @@
 use std::io::{Write, BufRead, Cursor};
-use std::mem::size_of;
 use std::error::Error;
-use std::sync::Arc;
 
 use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
 use fst::{map, Map, Streamer, IntoStreamer};
@@ -10,51 +8,51 @@ use sdset::duo::Union;
 use fst::raw::Fst;
 
 use crate::data::{DocIndexes, DocIndexesBuilder};
+use crate::data::SharedData;
 use crate::DocIndex;
 
 #[derive(Default)]
 pub struct Positive {
-    pub map: Map,
-    pub indexes: DocIndexes,
+    map: Map,
+    indexes: DocIndexes,
 }
 
 impl Positive {
-    pub fn from_shared_bytes(
-        bytes: Arc<Vec<u8>>,
-        offset: usize,
-        len: usize,
-    ) -> Result<(Positive, usize), Box<Error>>
-    {
-        let mut cursor = Cursor::new(&bytes[..len]);
-        cursor.consume(offset);
+    pub fn new(map: Map, indexes: DocIndexes) -> Positive {
+        Positive { map, indexes }
+    }
 
-        let map_len = cursor.read_u64::<LittleEndian>()? as usize;
-        let map_offset = cursor.position() as usize;
-        let fst = Fst::from_shared_bytes(bytes.clone(), map_offset, map_len)?;
+    pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
+        let len = cursor.read_u64::<LittleEndian>()? as usize;
+        let offset = cursor.position() as usize;
+        let data = cursor.get_ref().range(offset, len);
+
+        let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
         let map = Map::from(fst);
+        cursor.consume(len);
 
-        cursor.consume(map_len);
-        let indexes_len = cursor.read_u64::<LittleEndian>()? 
as usize;
-        let indexes_offset = cursor.position() as usize;
-        let indexes = DocIndexes::from_shared_bytes(bytes, indexes_offset, indexes_len)?;
+        let indexes = DocIndexes::from_cursor(cursor)?;
 
-        let positive = Positive { map, indexes };
-        let len = indexes_offset + indexes_len;
-
-        Ok((positive, len))
+        Ok(Positive { map, indexes})
     }
 
     pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
-        // indexes
         let slice = self.map.as_fst().as_bytes();
         let len = slice.len() as u64;
         let _ = bytes.write_u64::<LittleEndian>(len);
         bytes.extend_from_slice(slice);
 
-        // map
         self.indexes.write_to_bytes(bytes);
     }
 
+    pub fn map(&self) -> &Map {
+        &self.map
+    }
+
+    pub fn indexes(&self) -> &DocIndexes {
+        &self.indexes
+    }
+
     pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
         let mut builder = PositiveBuilder::memory();
         let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
@@ -155,15 +153,11 @@ impl<W: Write, X: Write> PositiveBuilder<W, X> {
     where K: AsRef<[u8]>,
     {
         self.map.insert(key, self.value)?;
-        self.indexes.insert(indexes)?;
+        self.indexes.insert(indexes);
         self.value += 1;
         Ok(())
     }
 
-    pub fn finish(self) -> Result<(), Box<Error>> {
-        self.into_inner().map(drop)
-    }
-
     pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
         let map = self.map.into_inner()?;
         let indexes = self.indexes.into_inner()?;
diff --git a/src/database/schema.rs b/src/database/schema.rs
index 5f622e003..60a258824 100644
--- a/src/database/schema.rs
+++ b/src/database/schema.rs
@@ -141,10 +141,12 @@ impl Schema {
         attributes
     }
 
-    pub fn document_id<T>(&self, document: &T) -> Result<DocumentId, Box<Error>>
+    pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
     where T: Serialize,
     {
-        unimplemented!()
+        let id_attribute_name = &self.inner.identifier;
+        let serializer = FindDocumentIdSerializer { id_attribute_name };
+        document.serialize(serializer)
     }
 
     pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs
index bfb0118ed..0bfb6e44a 100644
--- a/src/database/serde/indexer_serializer.rs
+++ b/src/database/serde/indexer_serializer.rs
@@ -1,4 +1,4 @@
-use crate::database::update::UnorderedPositiveBlobBuilder;
+use crate::database::update::RawUpdateBuilder;
 use crate::database::schema::SchemaAttr;
 use crate::database::serde::SerializerError;
 use crate::tokenizer::TokenizerBuilder;
@@ -10,7 +10,7 @@ use serde::ser;
 
 pub struct IndexerSerializer<'a, B> {
     pub tokenizer_builder: &'a B,
-    pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
+    pub builder: &'a mut RawUpdateBuilder,
     pub document_id: DocumentId,
     pub attribute: SchemaAttr,
 }
@@ -72,10 +72,10 @@ where B: TokenizerBuilder
             // and the unidecoded lowercased version
             let word_unidecoded = unidecode::unidecode(word).to_lowercase();
             if word_lower != word_unidecoded {
-                self.builder.insert(word_unidecoded, doc_index);
+                self.builder.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
             }
 
-            self.builder.insert(word_lower, doc_index);
+            self.builder.insert_doc_index(word_lower.into_bytes(), doc_index);
         }
         Ok(())
     }
diff --git a/src/database/serde/serializer.rs b/src/database/serde/serializer.rs
index d2faed2db..48c58fd0d 100644
--- a/src/database/serde/serializer.rs
+++ b/src/database/serde/serializer.rs
@@ -1,24 +1,20 @@
-use std::collections::BTreeMap;
-
 use serde::Serialize;
 use serde::ser;
 
 use crate::database::serde::indexer_serializer::IndexerSerializer;
 use crate::database::serde::key_to_string::KeyToStringSerializer;
-use crate::database::update::UnorderedPositiveBlobBuilder;
 use crate::database::document_key::DocumentKeyAttr;
-use 
crate::database::update::NewState;
-use crate::database::Schema;
+use crate::database::update::RawUpdateBuilder;
 use crate::database::serde::SerializerError;
 use crate::tokenizer::TokenizerBuilder;
+use crate::database::schema::Schema;
 use crate::DocumentId;
 
 pub struct Serializer<'a, B> {
     pub schema: &'a Schema,
-    pub tokenizer_builder: &'a B,
     pub document_id: DocumentId,
-    pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
-    pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
+    pub tokenizer_builder: &'a B,
+    pub builder: &'a mut RawUpdateBuilder,
 }
 
 impl<'a, B> ser::Serializer for Serializer<'a, B>
@@ -145,7 +141,6 @@ where B: TokenizerBuilder
             document_id: self.document_id,
             current_key_name: None,
             builder: self.builder,
-            new_states: self.new_states,
         })
     }
 
@@ -160,7 +155,6 @@ where B: TokenizerBuilder
             tokenizer_builder: self.tokenizer_builder,
             document_id: self.document_id,
             builder: self.builder,
-            new_states: self.new_states,
         })
     }
 
@@ -181,8 +175,7 @@ pub struct MapSerializer<'a, B> {
     pub tokenizer_builder: &'a B,
     pub document_id: DocumentId,
     pub current_key_name: Option<String>,
-    pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
-    pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
+    pub builder: &'a mut RawUpdateBuilder,
 }
 
 impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
@@ -220,7 +213,7 @@ where B: TokenizerBuilder
             if props.is_stored() {
                 let value = bincode::serialize(value).unwrap();
                 let key = DocumentKeyAttr::new(self.document_id, attr);
-                self.new_states.insert(key, NewState::Updated { value });
+                self.builder.insert_attribute_value(key, value);
             }
             if props.is_indexed() {
                 let serializer = IndexerSerializer {
@@ -243,10 +236,9 @@ where B: TokenizerBuilder
 
 pub struct StructSerializer<'a, B> {
     pub schema: &'a Schema,
-    pub tokenizer_builder: &'a B,
     pub document_id: DocumentId,
-    pub builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
-    pub new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
+    pub tokenizer_builder: &'a B,
+    pub builder: &'a mut RawUpdateBuilder,
 }
 
 impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
@@ -267,7 +259,7 @@ where B: TokenizerBuilder
             if props.is_stored() {
                 let value = bincode::serialize(value).unwrap();
                 let key = DocumentKeyAttr::new(self.document_id, attr);
-                self.new_states.insert(key, NewState::Updated { value });
+                self.builder.insert_attribute_value(key, value);
             }
             if props.is_indexed() {
                 let serializer = IndexerSerializer {
diff --git a/src/database/update/builder.rs b/src/database/update/builder.rs
index c3bdf59fc..344eb84e4 100644
--- a/src/database/update/builder.rs
+++ b/src/database/update/builder.rs
@@ -1,95 +1,60 @@
-use std::collections::{BTreeMap, BTreeSet};
 use std::path::PathBuf;
 use std::error::Error;
 
-use fst::map::{Map, MapBuilder};
-use rocksdb::rocksdb_options;
 use serde::Serialize;
-use sdset::Set;
 
-use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
-use crate::database::{DATA_INDEX, Schema, DocumentKeyAttr};
-use crate::data::{DocIds, DocIndexes};
-use crate::{DocumentId, DocIndex};
-use super::Update;
+use crate::database::serde::serializer::Serializer;
+use crate::database::serde::SerializerError;
+use crate::tokenizer::TokenizerBuilder;
+use crate::database::Schema;
 
-type Token = Vec<u8>; // TODO could be replaced by a SmallVec
-type Value = Vec<u8>;
+use crate::DocumentId;
+use super::{Update, RawUpdateBuilder};
 
 pub struct UpdateBuilder {
-    sst_file: PathBuf,
     schema: Schema,
-    removed_documents: BTreeSet<DocumentId>,
-    words_indexes: BTreeMap<Token, Vec<DocIndex>>,
-    keys_values: BTreeMap<DocumentKeyAttr, Value>,
+    raw_builder: RawUpdateBuilder,
 }
 
 impl UpdateBuilder {
     pub fn new(path: PathBuf, schema: Schema) 
-> UpdateBuilder {
         UpdateBuilder {
-            sst_file: path,
             schema: schema,
-            removed_documents: BTreeSet::new(),
-            words_indexes: BTreeMap::new(),
-            keys_values: BTreeMap::new(),
+            raw_builder: RawUpdateBuilder::new(path),
         }
     }
 
-    pub fn update_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
+    pub fn update_document<T, B>(
+        &mut self,
+        document: T,
+        tokenizer_builder: &B,
+    ) -> Result<DocumentId, SerializerError>
     where T: Serialize,
+          B: TokenizerBuilder,
     {
-        unimplemented!()
+        let document_id = self.schema.document_id(&document)?;
+
+        let serializer = Serializer {
+            schema: &self.schema,
+            document_id: document_id,
+            tokenizer_builder: tokenizer_builder,
+            builder: &mut self.raw_builder,
+        };
+
+        document.serialize(serializer)?;
+
+        Ok(document_id)
     }
 
-    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, Box<Error>>
+    pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
     where T: Serialize,
     {
-        unimplemented!()
+        let document_id = self.schema.document_id(&document)?;
+        self.raw_builder.remove_document(document_id);
+        Ok(document_id)
     }
 
     pub fn build(self) -> Result<Update, Box<Error>> {
-        let tree = {
-            let negative = {
-                let documents_ids = self.removed_documents.into_iter().collect();
-                let doc_ids = DocIds::from_raw(documents_ids);
-                Negative { doc_ids }
-            };
-
-            let positive = {
-                let mut builder = PositiveBuilder::memory();
-
-                for (key, mut indexes) in self.words_indexes {
-                    indexes.sort_unstable();
-                    let indexes = Set::new_unchecked(&indexes);
-                    builder.insert(key, indexes);
-                }
-
-                let (map, indexes) = builder.into_inner()?;
-                let map = Map::from_bytes(map)?;
-                let indexes = DocIndexes::from_bytes(indexes)?;
-                Positive { map, indexes }
-            };
-
-            Index { negative, positive }
-        };
-
-        let env_options = rocksdb_options::EnvOptions::new();
-        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
-        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
-        file_writer.open(&self.sst_file.to_string_lossy())?;
-
-        // write the data-index
-        let mut bytes = Vec::new();
-        tree.write_to_bytes(&mut bytes);
-        file_writer.merge(DATA_INDEX, &bytes)?;
-
-        // write all the documents attributes updates
-        for (key, value) in self.keys_values {
-            file_writer.put(key.as_ref(), &value)?;
-        }
-
-        file_writer.finish()?;
-
-        Ok(Update { sst_file: self.sst_file })
+        self.raw_builder.build()
     }
 }
diff --git a/src/database/update/mod.rs b/src/database/update/mod.rs
index 7bdda9949..3e3eb8cca 100644
--- a/src/database/update/mod.rs
+++ b/src/database/update/mod.rs
@@ -1,8 +1,10 @@
 use std::path::{Path, PathBuf};
 
 mod builder;
+mod raw_builder;
 
 pub use self::builder::UpdateBuilder;
+pub use self::raw_builder::RawUpdateBuilder;
 
 pub struct Update {
     sst_file: PathBuf,
diff --git a/src/database/update/raw_builder.rs b/src/database/update/raw_builder.rs
new file mode 100644
index 000000000..e7e65a5fc
--- /dev/null
+++ b/src/database/update/raw_builder.rs
@@ -0,0 +1,93 @@
+use std::collections::{BTreeMap, BTreeSet};
+use std::path::PathBuf;
+use std::error::Error;
+
+use rocksdb::rocksdb_options;
+use fst::map::Map;
+use sdset::Set;
+
+use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
+use crate::database::{DATA_INDEX, DocumentKeyAttr};
+use crate::data::{DocIds, DocIndexes};
+use crate::{DocumentId, DocIndex};
+use super::Update;
+
+type Token = Vec<u8>; // TODO could be replaced by a SmallVec
+type Value = Vec<u8>;
+
+pub struct RawUpdateBuilder {
+    sst_file: PathBuf,
+    removed_documents: BTreeSet<DocumentId>,
+    words_indexes: BTreeMap<Token, Vec<DocIndex>>,
+    keys_values: BTreeMap<DocumentKeyAttr, Value>,
+}
+
+impl RawUpdateBuilder {
+    pub fn new(path: PathBuf) -> 
RawUpdateBuilder {
+        RawUpdateBuilder {
+            sst_file: path,
+            removed_documents: BTreeSet::new(),
+            words_indexes: BTreeMap::new(),
+            keys_values: BTreeMap::new(),
+        }
+    }
+
+    pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
+        self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
+    }
+
+    pub fn insert_attribute_value(&mut self, key_attr: DocumentKeyAttr, value: Vec<u8>) -> Option<Vec<u8>> {
+        self.keys_values.insert(key_attr, value)
+    }
+
+    pub fn remove_document(&mut self, id: DocumentId) {
+        self.removed_documents.insert(id);
+    }
+
+    pub fn build(self) -> Result<Update, Box<Error>> {
+        let tree = {
+            let negative = {
+                let documents_ids: Vec<_> = self.removed_documents.into_iter().collect();
+                let documents_ids = Set::new_unchecked(&documents_ids);
+                let doc_ids = DocIds::new(documents_ids);
+                Negative::new(doc_ids)
+            };
+
+            let positive = {
+                let mut builder = PositiveBuilder::memory();
+
+                for (key, mut indexes) in self.words_indexes {
+                    indexes.sort_unstable();
+                    let indexes = Set::new_unchecked(&indexes);
+                    builder.insert(key, indexes)?;
+                }
+
+                let (map, indexes) = builder.into_inner()?;
+                let map = Map::from_bytes(map)?;
+                let indexes = DocIndexes::from_bytes(indexes)?;
+                Positive::new(map, indexes)
+            };
+
+            Index { negative, positive }
+        };
+
+        let env_options = rocksdb_options::EnvOptions::new();
+        let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
+        let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
+        file_writer.open(&self.sst_file.to_string_lossy())?;
+
+        // write the data-index
+        let mut bytes = Vec::new();
+        tree.write_to_bytes(&mut bytes);
+        file_writer.merge(DATA_INDEX, &bytes)?;
+
+        // write all the documents attributes updates
+        for (key, value) in self.keys_values {
+            file_writer.put(key.as_ref(), &value)?;
+        }
+
+        file_writer.finish()?;
+
+        Ok(Update { sst_file: self.sst_file })
+    }
+}
diff --git a/src/rank/query_builder.rs b/src/rank/query_builder.rs
index a5bbdf885..5e4ee0d11 100644
--- a/src/rank/query_builder.rs
+++ b/src/rank/query_builder.rs
@@ -86,7 +86,7 @@ where D: Deref<Target=DB>,
         let mut stream = {
             let mut op_builder = fst::map::OpBuilder::new();
             for automaton in &automatons {
-                let stream = self.view.index().positive.map.search(automaton);
+                let stream = self.view.index().positive.map().search(automaton);
                 op_builder.push(stream);
             }
             op_builder.union()
@@ -100,7 +100,7 @@
             let distance = automaton.eval(input).to_u8();
             let is_exact = distance == 0 && input.len() == automaton.query_len();
 
-            let doc_indexes = &self.view.index().positive.indexes;
+            let doc_indexes = &self.view.index().positive.indexes();
             let doc_indexes = &doc_indexes[iv.value as usize];
 
             for doc_index in doc_indexes {

From 21bb38c3b0396f0ef638b01c2da3eb4d00cfeb09 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Cl=C3=A9ment=20Renault?=
Date: Mon, 31 Dec 2018 19:02:30 +0100
Subject: [PATCH 5/5] test: Add more tests for update ingestion

---
 src/database/database.rs | 98 +++++++++++++++++++++++++++++++++-
 1 file changed, 97 insertions(+), 1 deletion(-)

diff --git a/src/database/database.rs b/src/database/database.rs
index 225059300..467a1b49a 100644
--- a/src/database/database.rs
+++ b/src/database/database.rs
@@ -159,7 +159,7 @@ mod tests {
     use crate::tokenizer::DefaultBuilder;
 
     #[test]
-    fn ingest_update_file() -> Result<(), Box<Error>> {
+    fn ingest_one_update_file() -> Result<(), Box<Error>> {
         let dir = tempdir()?;
 
         let rocksdb_path = dir.path().join("rocksdb.rdb");
@@ -221,4 +221,100 @@ mod tests {
 
         Ok(dir.close()?)
    }
+
+    #[test]
+    fn ingest_two_update_files() -> Result<(), Box<Error>> {
+        let dir = tempdir()?;
+
+        let rocksdb_path = dir.path().join("rocksdb.rdb");
+
+        #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+        struct SimpleDoc {
+            id: u64,
+            title: String,
+            description: String,
+            timestamp: u64,
+        }
+
+        let schema = {
+            let mut builder = SchemaBuilder::with_identifier("id");
+            builder.new_attribute("id", STORED);
+            builder.new_attribute("title", STORED | INDEXED);
+            builder.new_attribute("description", STORED | INDEXED);
+            builder.new_attribute("timestamp", STORED);
+            builder.build()
+        };
+
+        let database = Database::create(&rocksdb_path, schema.clone())?;
+
+        let doc0 = SimpleDoc {
+            id: 0,
+            title: String::from("I am a title"),
+            description: String::from("I am a description"),
+            timestamp: 1234567,
+        };
+        let doc1 = SimpleDoc {
+            id: 1,
+            title: String::from("I am the second title"),
+            description: String::from("I am the second description"),
+            timestamp: 7654321,
+        };
+        let doc2 = SimpleDoc {
+            id: 2,
+            title: String::from("I am the third title"),
+            description: String::from("I am the third description"),
+            timestamp: 7654321,
+        };
+        let doc3 = SimpleDoc {
+            id: 3,
+            title: String::from("I am the fourth title"),
+            description: String::from("I am the fourth description"),
+            timestamp: 7654321,
+        };
+
+        let docid0;
+        let docid1;
+        let update1 = {
+            let tokenizer_builder = DefaultBuilder::new();
+            let update_path = dir.path().join("update-000.sst");
+            let mut builder = UpdateBuilder::new(update_path, schema.clone());
+
+            docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
+            docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
+
+            builder.build()?
+        };
+
+        let docid2;
+        let docid3;
+        let update2 = {
+            let tokenizer_builder = DefaultBuilder::new();
+            let update_path = dir.path().join("update-001.sst");
+            let mut builder = UpdateBuilder::new(update_path, schema);
+
+            docid2 = builder.update_document(&doc2, &tokenizer_builder)?;
+            docid3 = builder.update_document(&doc3, &tokenizer_builder)?;
+
+            builder.build()?
+        };
+
+        database.ingest_update_file(update1)?;
+        database.ingest_update_file(update2)?;
+
+        let view = database.view();
+
+        let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
+        let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
+
+        assert_eq!(doc0, de_doc0);
+        assert_eq!(doc1, de_doc1);
+
+        let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
+        let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
+
+        assert_eq!(doc2, de_doc2);
+        assert_eq!(doc3, de_doc3);
+
+        Ok(dir.close()?)
+    }
 }
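
Note on the serialization convention used throughout these patches: patches 1 and 4 settle on a single on-disk layout for the index, where every section (the negative document ids, the positive fst map, the doc indexes) is written as a little-endian u64 length followed by the raw bytes, and read back by walking one shared Cursor forward. The standalone Rust sketch below illustrates that length-prefixed round trip. It is not code from the patches: the write_section/read_section helper names are hypothetical, and only std plus the byteorder crate (already a dependency above) are assumed.

use std::io::{self, BufRead, Cursor};
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};

// Write one section: a little-endian u64 length prefix, then the raw bytes,
// mirroring the DocIds/DocIndexes write_to_bytes methods in patch 4.
fn write_section(bytes: &mut Vec<u8>, section: &[u8]) {
    let _ = bytes.write_u64::<LittleEndian>(section.len() as u64);
    bytes.extend_from_slice(section);
}

// Read one section back, mirroring the from_cursor constructors: read the
// length, peek at the underlying buffer at the current position, then
// consume() so the next call starts at the following section.
fn read_section(cursor: &mut Cursor<Vec<u8>>) -> io::Result<Vec<u8>> {
    let len = cursor.read_u64::<LittleEndian>()? as usize;
    let offset = cursor.position() as usize;
    let section = cursor.get_ref()[offset..offset + len].to_vec();
    cursor.consume(len); // advance the cursor past the section body
    Ok(section)
}

fn main() -> io::Result<()> {
    let mut bytes = Vec::new();
    write_section(&mut bytes, b"negative doc ids");
    write_section(&mut bytes, b"positive fst + doc indexes");

    // Sections come back in write order, each call consuming exactly one.
    let mut cursor = Cursor::new(bytes);
    assert_eq!(read_section(&mut cursor)?, b"negative doc ids".to_vec());
    assert_eq!(read_section(&mut cursor)?, b"positive fst + doc indexes".to_vec());
    Ok(())
}

Reading through position() and consume(len) instead of hand-computed offsets is what lets Index::from_shared_bytes in patch 4 simply chain Negative::from_cursor and Positive::from_cursor over the same cursor, and it is also why the trailing-length layout of the old DocIndexes::from_data could be dropped in favor of plain front-to-back prefixes.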