mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-29 16:24:26 +01:00
Merge pull request #56 from Kerollmops/new-index-struct
New Index structure
This commit is contained in:
commit
a842e647f7
@ -5,7 +5,7 @@ use serde_derive::{Serialize, Deserialize};
|
||||
use structopt::StructOpt;
|
||||
|
||||
use meilidb::database::schema::{Schema, SchemaBuilder, STORED, INDEXED};
|
||||
use meilidb::database::PositiveUpdateBuilder;
|
||||
use meilidb::database::UpdateBuilder;
|
||||
use meilidb::tokenizer::DefaultBuilder;
|
||||
use meilidb::database::Database;
|
||||
|
||||
@ -44,7 +44,7 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
|
||||
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let update_path = tempfile::NamedTempFile::new()?;
|
||||
let mut update = PositiveUpdateBuilder::new(update_path.path(), schema, tokenizer_builder);
|
||||
let mut update = UpdateBuilder::new(update_path.path().to_path_buf(), schema);
|
||||
|
||||
let mut rdr = csv::Reader::from_path(csv_data_path)?;
|
||||
let mut raw_record = csv::StringRecord::new();
|
||||
@ -59,12 +59,10 @@ fn index(schema: Schema, database_path: &Path, csv_data_path: &Path) -> Result<D
|
||||
}
|
||||
};
|
||||
|
||||
update.update(&document).unwrap();
|
||||
update.update_document(&document, &tokenizer_builder)?;
|
||||
}
|
||||
|
||||
let mut update = update.build()?;
|
||||
|
||||
update.set_move(true);
|
||||
let update = update.build()?;
|
||||
database.ingest_update_file(update)?;
|
||||
|
||||
Ok(database)
|
||||
|
@ -1,59 +1,54 @@
|
||||
use std::io::{self, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{io, mem};
|
||||
use std::mem::size_of;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use sdset::Set;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
|
||||
use crate::DocumentId;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Default, Clone)]
|
||||
pub struct DocIds {
|
||||
data: Data,
|
||||
}
|
||||
pub struct DocIds(SharedData);
|
||||
|
||||
impl DocIds {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
let data = Data::Mmap(mmap);
|
||||
Ok(DocIds { data })
|
||||
pub fn new(ids: &Set<DocumentId>) -> DocIds {
|
||||
let bytes = unsafe { into_u8_slice(ids.as_slice()) };
|
||||
let data = SharedData::from_bytes(bytes.to_vec());
|
||||
DocIds(data)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
// FIXME check if modulo DocumentId
|
||||
let len = vec.len();
|
||||
let data = Data::Shared {
|
||||
bytes: Arc::new(vec),
|
||||
offset: 0,
|
||||
len: len
|
||||
};
|
||||
Ok(DocIds { data })
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIds> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let doc_ids = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIds(doc_ids))
|
||||
}
|
||||
|
||||
pub fn from_document_ids(vec: Vec<DocumentId>) -> Self {
|
||||
DocIds::from_bytes(unsafe { mem::transmute(vec) }).unwrap()
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let len = self.0.len() as u64;
|
||||
bytes.write_u64::<LittleEndian>(len).unwrap();
|
||||
bytes.extend_from_slice(&self.0);
|
||||
}
|
||||
|
||||
pub fn contains(&self, doc: DocumentId) -> bool {
|
||||
// FIXME prefer using the sdset::exponential_search function
|
||||
self.doc_ids().binary_search(&doc).is_ok()
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
|
||||
pub fn doc_ids(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.data;
|
||||
pub fn as_bytes(&self) -> &[u8] {
|
||||
&self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<Set<DocumentId>> for DocIds {
|
||||
fn as_ref(&self) -> &Set<DocumentId> {
|
||||
let slice = &self.0;
|
||||
let ptr = slice.as_ptr() as *const DocumentId;
|
||||
let len = slice.len() / mem::size_of::<DocumentId>();
|
||||
let len = slice.len() / size_of::<DocumentId>();
|
||||
let slice = unsafe { from_raw_parts(ptr, len) };
|
||||
Set::new_unchecked(slice)
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for DocIds {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.data.as_ref().serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
@ -1,16 +1,15 @@
|
||||
use std::io::{self, Write, Cursor, BufRead};
|
||||
use std::slice::from_raw_parts;
|
||||
use std::io::{self, Write};
|
||||
use std::mem::size_of;
|
||||
use std::ops::Index;
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::raw::MmapReadOnly;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::Data;
|
||||
use crate::data::SharedData;
|
||||
use super::into_u8_slice;
|
||||
|
||||
#[derive(Debug)]
|
||||
#[repr(C)]
|
||||
@ -21,52 +20,45 @@ struct Range {
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub struct DocIndexes {
|
||||
ranges: Data,
|
||||
indexes: Data,
|
||||
ranges: SharedData,
|
||||
indexes: SharedData,
|
||||
}
|
||||
|
||||
impl DocIndexes {
|
||||
pub unsafe fn from_path<P: AsRef<Path>>(path: P) -> io::Result<Self> {
|
||||
let mmap = MmapReadOnly::open_path(path)?;
|
||||
DocIndexes::from_data(Data::Mmap(mmap))
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> io::Result<DocIndexes> {
|
||||
let bytes = Arc::new(bytes);
|
||||
let len = bytes.len();
|
||||
let data = SharedData::new(bytes, 0, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
DocIndexes::from_cursor(&mut cursor)
|
||||
}
|
||||
|
||||
pub fn from_bytes(vec: Vec<u8>) -> io::Result<Self> {
|
||||
let len = vec.len();
|
||||
DocIndexes::from_shared_bytes(Arc::new(vec), 0, len)
|
||||
}
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> io::Result<DocIndexes> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let ranges = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> io::Result<Self> {
|
||||
let data = Data::Shared { bytes, offset, len };
|
||||
DocIndexes::from_data(data)
|
||||
}
|
||||
|
||||
fn from_data(data: Data) -> io::Result<Self> {
|
||||
let ranges_len_offset = data.len() - size_of::<u64>();
|
||||
let ranges_len = (&data[ranges_len_offset..]).read_u64::<LittleEndian>()?;
|
||||
let ranges_len = ranges_len as usize;
|
||||
|
||||
let ranges_offset = ranges_len_offset - ranges_len;
|
||||
let ranges = data.range(ranges_offset, ranges_len);
|
||||
|
||||
let indexes = data.range(0, ranges_offset);
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let indexes = cursor.get_ref().range(offset, len);
|
||||
cursor.consume(len);
|
||||
|
||||
Ok(DocIndexes { ranges, indexes })
|
||||
}
|
||||
|
||||
pub fn to_vec(&self) -> Vec<u8> {
|
||||
let capacity = self.indexes.len() + self.ranges.len() + size_of::<u64>();
|
||||
let mut bytes = Vec::with_capacity(capacity);
|
||||
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let ranges_len = self.ranges.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(ranges_len);
|
||||
bytes.extend_from_slice(&self.ranges);
|
||||
bytes.write_u64::<LittleEndian>(self.ranges.len() as u64).unwrap();
|
||||
|
||||
bytes
|
||||
let indexes_len = self.indexes.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(indexes_len);
|
||||
bytes.extend_from_slice(&self.indexes);
|
||||
}
|
||||
|
||||
pub fn get(&self, index: usize) -> Option<&Set<DocIndex>> {
|
||||
self.ranges().get(index as usize).map(|Range { start, end }| {
|
||||
self.ranges().get(index).map(|Range { start, end }| {
|
||||
let start = *start as usize;
|
||||
let end = *end as usize;
|
||||
let slice = &self.indexes()[start..end];
|
||||
@ -102,12 +94,17 @@ impl Index<usize> for DocIndexes {
|
||||
|
||||
pub struct DocIndexesBuilder<W> {
|
||||
ranges: Vec<Range>,
|
||||
indexes: Vec<DocIndex>,
|
||||
wtr: W,
|
||||
}
|
||||
|
||||
impl DocIndexesBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
DocIndexesBuilder::new(Vec::new())
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: Vec::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -115,19 +112,18 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
pub fn new(wtr: W) -> Self {
|
||||
DocIndexesBuilder {
|
||||
ranges: Vec::new(),
|
||||
indexes: Vec::new(),
|
||||
wtr: wtr,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) -> io::Result<()> {
|
||||
pub fn insert(&mut self, indexes: &Set<DocIndex>) {
|
||||
let len = indexes.len() as u64;
|
||||
let start = self.ranges.last().map(|r| r.end).unwrap_or(0);
|
||||
let range = Range { start, end: start + len };
|
||||
self.ranges.push(range);
|
||||
|
||||
// write the values
|
||||
let indexes = unsafe { into_u8_slice(indexes) };
|
||||
self.wtr.write_all(indexes)
|
||||
self.indexes.extend_from_slice(indexes);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> io::Result<()> {
|
||||
@ -135,24 +131,20 @@ impl<W: Write> DocIndexesBuilder<W> {
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
// write the ranges
|
||||
let ranges = unsafe { into_u8_slice(self.ranges.as_slice()) };
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
// write the length of the ranges
|
||||
let ranges = unsafe { into_u8_slice(&self.ranges) };
|
||||
let len = ranges.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(ranges)?;
|
||||
|
||||
let indexes = unsafe { into_u8_slice(&self.indexes) };
|
||||
let len = indexes.len() as u64;
|
||||
self.wtr.write_u64::<LittleEndian>(len)?;
|
||||
self.wtr.write_all(indexes)?;
|
||||
|
||||
Ok(self.wtr)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reinterprets a slice of `T` as the raw bytes that back it.
///
/// The returned slice borrows from `slice` and is
/// `slice.len() * size_of::<T>()` bytes long.
///
/// # Safety
///
/// Every byte of every element becomes readable through the result, so the
/// caller must make sure `T` has no padding or otherwise uninitialised bytes.
unsafe fn into_u8_slice<T>(slice: &[T]) -> &[u8] {
    let byte_len = size_of::<T>() * slice.len();
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@ -182,9 +174,9 @@ mod tests {
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(bytes)?;
|
||||
@ -217,13 +209,15 @@ mod tests {
|
||||
|
||||
let mut builder = DocIndexesBuilder::memory();
|
||||
|
||||
builder.insert(Set::new(&[a])?)?;
|
||||
builder.insert(Set::new(&[a, b, c])?)?;
|
||||
builder.insert(Set::new(&[a, c])?)?;
|
||||
builder.insert(Set::new(&[a])?);
|
||||
builder.insert(Set::new(&[a, b, c])?);
|
||||
builder.insert(Set::new(&[a, c])?);
|
||||
|
||||
let builder_bytes = builder.into_inner()?;
|
||||
let docs = DocIndexes::from_bytes(builder_bytes.clone())?;
|
||||
let bytes = docs.to_vec();
|
||||
|
||||
let mut bytes = Vec::new();
|
||||
docs.write_to_bytes(&mut bytes);
|
||||
|
||||
assert_eq!(builder_bytes, bytes);
|
||||
|
||||
|
@ -1,51 +1,43 @@
|
||||
mod doc_ids;
|
||||
mod doc_indexes;
|
||||
|
||||
use std::slice::from_raw_parts;
|
||||
use std::mem::size_of;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::raw::MmapReadOnly;
|
||||
|
||||
pub use self::doc_ids::DocIds;
|
||||
pub use self::doc_indexes::{DocIndexes, DocIndexesBuilder};
|
||||
|
||||
#[derive(Clone)]
|
||||
enum Data {
|
||||
Shared {
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
},
|
||||
Mmap(MmapReadOnly),
|
||||
/// A cheaply clonable window into a reference-counted byte buffer.
///
/// The bytes visible through a value are `bytes[offset..offset + len]`.
#[derive(Default, Clone)]
pub struct SharedData {
    /// Backing buffer, shared by every view cut from it.
    pub bytes: Arc<Vec<u8>>,
    /// Index inside `bytes` of the first visible byte.
    pub offset: usize,
    /// Number of visible bytes.
    pub len: usize,
}
|
||||
|
||||
impl Data {
|
||||
pub fn range(&self, off: usize, l: usize) -> Data {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
assert!(off + l <= *len);
|
||||
Data::Shared {
|
||||
bytes: bytes.clone(),
|
||||
offset: offset + off,
|
||||
len: l,
|
||||
impl SharedData {
|
||||
pub fn from_bytes(vec: Vec<u8>) -> SharedData {
|
||||
let len = vec.len();
|
||||
let bytes = Arc::new(vec);
|
||||
SharedData::new(bytes, 0, len)
|
||||
}
|
||||
},
|
||||
Data::Mmap(mmap) => Data::Mmap(mmap.range(off, l)),
|
||||
|
||||
pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
|
||||
SharedData { bytes, offset, len }
|
||||
}
|
||||
|
||||
pub fn range(&self, offset: usize, len: usize) -> SharedData {
|
||||
assert!(offset + len <= self.len);
|
||||
SharedData {
|
||||
bytes: self.bytes.clone(),
|
||||
offset: self.offset + offset,
|
||||
len: len,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for Data {
|
||||
fn default() -> Data {
|
||||
Data::Shared {
|
||||
bytes: Arc::default(),
|
||||
offset: 0,
|
||||
len: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Data {
|
||||
impl Deref for SharedData {
|
||||
type Target = [u8];
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
@ -53,13 +45,14 @@ impl Deref for Data {
|
||||
}
|
||||
}
|
||||
|
||||
impl AsRef<[u8]> for Data {
|
||||
impl AsRef<[u8]> for SharedData {
|
||||
fn as_ref(&self) -> &[u8] {
|
||||
match self {
|
||||
Data::Shared { bytes, offset, len } => {
|
||||
&bytes[*offset..offset + len]
|
||||
},
|
||||
Data::Mmap(m) => m.as_slice(),
|
||||
}
|
||||
&self.bytes[self.offset..self.offset + self.len]
|
||||
}
|
||||
}
|
||||
|
||||
/// Exposes the memory of a `&[T]` as a plain byte slice.
///
/// The result borrows from `slice` and spans
/// `slice.len() * size_of::<T>()` bytes.
///
/// # Safety
///
/// All bytes of all elements are readable through the returned slice, so
/// the caller must guarantee that `T` contains no padding or other
/// uninitialised bytes.
unsafe fn into_u8_slice<T: Sized>(slice: &[T]) -> &[u8] {
    let byte_len = size_of::<T>() * slice.len();
    from_raw_parts(slice.as_ptr() as *const u8, byte_len)
}
|
||||
|
@ -1,110 +0,0 @@
|
||||
mod ops;
|
||||
pub mod positive;
|
||||
pub mod negative;
|
||||
|
||||
pub use self::positive::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::negative::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
||||
|
||||
use std::fmt;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum Blob {
|
||||
Positive(PositiveBlob),
|
||||
Negative(NegativeBlob),
|
||||
}
|
||||
|
||||
impl Blob {
|
||||
pub fn is_negative(&self) -> bool {
|
||||
self.sign() == Sign::Negative
|
||||
}
|
||||
|
||||
pub fn is_positive(&self) -> bool {
|
||||
self.sign() == Sign::Positive
|
||||
}
|
||||
|
||||
pub fn sign(&self) -> Sign {
|
||||
match self {
|
||||
Blob::Positive(_) => Sign::Positive,
|
||||
Blob::Negative(_) => Sign::Negative,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for Blob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
match self {
|
||||
Blob::Positive(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Positive)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
Blob::Negative(blob) => {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&Sign::Negative)?;
|
||||
tuple.serialize_element(&blob)?;
|
||||
tuple.end()
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for Blob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Blob, D::Error> {
|
||||
struct TupleVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for TupleVisitor {
|
||||
type Value = Blob;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a Blob struct")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
|
||||
let sign = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(0, &self)),
|
||||
};
|
||||
match sign {
|
||||
Sign::Positive => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Positive(blob))
|
||||
},
|
||||
Sign::Negative => {
|
||||
let blob = match seq.next_element()? {
|
||||
Some(value) => value,
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
Ok(Blob::Negative(blob))
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_tuple(2, TupleVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum Sign {
|
||||
Positive,
|
||||
Negative,
|
||||
}
|
||||
|
||||
impl Sign {
|
||||
pub fn invert(self) -> Sign {
|
||||
match self {
|
||||
Sign::Positive => Sign::Negative,
|
||||
Sign::Negative => Sign::Positive,
|
||||
}
|
||||
}
|
||||
}
|
@ -1,67 +0,0 @@
|
||||
use std::error::Error;
|
||||
use std::path::Path;
|
||||
use std::fmt;
|
||||
|
||||
use sdset::Set;
|
||||
use serde::de::{self, Deserialize, Deserializer};
|
||||
use serde::ser::{Serialize, Serializer};
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct NegativeBlob {
|
||||
doc_ids: DocIds,
|
||||
}
|
||||
|
||||
impl NegativeBlob {
|
||||
pub unsafe fn from_path<P>(doc_ids: P) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
{
|
||||
let doc_ids = DocIds::from_path(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_bytes(doc_ids: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let doc_ids = DocIds::from_bytes(doc_ids)?;
|
||||
Ok(NegativeBlob { doc_ids })
|
||||
}
|
||||
|
||||
pub fn from_raw(doc_ids: DocIds) -> Self {
|
||||
NegativeBlob { doc_ids }
|
||||
}
|
||||
|
||||
pub fn as_ids(&self) -> &DocIds {
|
||||
&self.doc_ids
|
||||
}
|
||||
|
||||
pub fn into_doc_ids(self) -> DocIds {
|
||||
self.doc_ids
|
||||
}
|
||||
}
|
||||
|
||||
/// Gives access to the blob's document ids as a sorted set.
impl AsRef<Set<DocumentId>> for NegativeBlob {
    fn as_ref(&self) -> &Set<DocumentId> {
        self.doc_ids.doc_ids()
    }
}
|
||||
|
||||
impl fmt::Debug for NegativeBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "NegativeBlob(")?;
|
||||
f.debug_list().entries(self.as_ref().as_slice()).finish()?;
|
||||
write!(f, ")")
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for NegativeBlob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
self.doc_ids.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for NegativeBlob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<NegativeBlob, D::Error> {
|
||||
let bytes = Vec::deserialize(deserializer)?;
|
||||
NegativeBlob::from_bytes(bytes).map_err(de::Error::custom)
|
||||
}
|
||||
}
|
@ -1,5 +0,0 @@
|
||||
mod blob;
|
||||
mod ops;
|
||||
|
||||
pub use self::blob::NegativeBlob;
|
||||
pub use self::ops::OpBuilder;
|
@ -1,73 +0,0 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::NegativeBlob;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct OpBuilder<'a> {
|
||||
inner: SdOpBuilder<'a, DocumentId>,
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple negative blobs.
|
||||
impl<'a> OpBuilder<'a> {
|
||||
pub fn new() -> Self {
|
||||
Self { inner: SdOpBuilder::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> Self {
|
||||
Self { inner: SdOpBuilder::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'a NegativeBlob) -> Self {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'a NegativeBlob) {
|
||||
let set = Set::new_unchecked(blob.as_ref());
|
||||
self.inner.push(set);
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'a> {
|
||||
Union::new(self.inner.union())
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'a> {
|
||||
Intersection::new(self.inner.intersection())
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'a> {
|
||||
Difference::new(self.inner.difference())
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'a> {
|
||||
SymmetricDifference::new(self.inner.symmetric_difference())
|
||||
}
|
||||
}
|
||||
|
||||
macro_rules! logical_operation {
|
||||
(struct $name:ident, $operation:ident) => {
|
||||
|
||||
pub struct $name<'a> {
|
||||
op: sdset::multi::$name<'a, DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> $name<'a> {
|
||||
fn new(op: sdset::multi::$name<'a, DocumentId>) -> Self {
|
||||
$name { op }
|
||||
}
|
||||
|
||||
pub fn into_negative_blob(self) -> NegativeBlob {
|
||||
let document_ids = sdset::SetOperation::into_set_buf(self.op);
|
||||
let doc_ids = DocIds::from_document_ids(document_ids.into_vec());
|
||||
NegativeBlob::from_raw(doc_ids)
|
||||
}
|
||||
}
|
||||
|
||||
}}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
@ -1,109 +0,0 @@
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use group_by::GroupBy;
|
||||
|
||||
use crate::database::blob::{Blob, Sign, PositiveBlob, PositiveBlobBuilder, NegativeBlob};
|
||||
use crate::database::blob::{positive, negative};
|
||||
|
||||
/// Tells whether both blobs are of the same kind (both positive or both
/// negative).
fn blob_same_sign(a: &Blob, b: &Blob) -> bool {
    let (left, right) = (a.sign(), b.sign());
    left == right
}
|
||||
|
||||
fn unwrap_positive(blob: &Blob) -> &PositiveBlob {
|
||||
match blob {
|
||||
Blob::Positive(blob) => blob,
|
||||
Blob::Negative(_) => panic!("called `unwrap_positive()` on a `Negative` value"),
|
||||
}
|
||||
}
|
||||
|
||||
fn unwrap_negative(blob: &Blob) -> &NegativeBlob {
|
||||
match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => panic!("called `unwrap_negative()` on a `Positive` value"),
|
||||
}
|
||||
}
|
||||
|
||||
pub struct OpBuilder {
|
||||
blobs: Vec<Blob>,
|
||||
}
|
||||
|
||||
impl OpBuilder {
|
||||
pub fn new() -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::new() }
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> OpBuilder {
|
||||
OpBuilder { blobs: Vec::with_capacity(cap) }
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: Blob) {
|
||||
if self.blobs.is_empty() && blob.is_negative() { return }
|
||||
self.blobs.push(blob);
|
||||
}
|
||||
|
||||
pub fn merge(self) -> Result<PositiveBlob, Box<Error>> {
|
||||
let groups = GroupBy::new(&self.blobs, blob_same_sign);
|
||||
let mut aggregated = Vec::new();
|
||||
|
||||
for blobs in groups {
|
||||
match blobs[0].sign() {
|
||||
Sign::Positive => {
|
||||
let mut op_builder = positive::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_positive(blob));
|
||||
}
|
||||
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
// FIXME empty doc_indexes must be handled by OpBuilder
|
||||
if !doc_indexes.is_empty() {
|
||||
builder.insert(input, doc_indexes).unwrap();
|
||||
}
|
||||
}
|
||||
let (map, doc_indexes) = builder.into_inner().unwrap();
|
||||
let blob = PositiveBlob::from_bytes(map, doc_indexes).unwrap();
|
||||
aggregated.push(Blob::Positive(blob));
|
||||
},
|
||||
Sign::Negative => {
|
||||
let mut op_builder = negative::OpBuilder::with_capacity(blobs.len());
|
||||
for blob in blobs {
|
||||
op_builder.push(unwrap_negative(blob));
|
||||
}
|
||||
let blob = op_builder.union().into_negative_blob();
|
||||
aggregated.push(Blob::Negative(blob));
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
aggregated.chunks(2).try_fold(PositiveBlob::default(), |base, slice| {
|
||||
let negative = NegativeBlob::default();
|
||||
let (positive, negative) = match slice {
|
||||
[a, b] => (unwrap_positive(a), unwrap_negative(b)),
|
||||
[a] => (unwrap_positive(a), &negative),
|
||||
_ => unreachable!(),
|
||||
};
|
||||
|
||||
let mut builder = PositiveBlobBuilder::memory();
|
||||
|
||||
let op_builder = positive::OpBuilder::new().add(&base).add(&positive);
|
||||
let mut stream = op_builder.union().into_stream();
|
||||
while let Some((input, doc_indexes)) = stream.next() {
|
||||
let op = DifferenceByKey::new(doc_indexes, negative.as_ref(), |x| x.document_id, |x| *x);
|
||||
|
||||
buffer.clear();
|
||||
op.extend_vec(&mut buffer);
|
||||
if !buffer.is_empty() {
|
||||
builder.insert(input, Set::new_unchecked(&buffer))?;
|
||||
}
|
||||
}
|
||||
|
||||
let (map, doc_indexes) = builder.into_inner()?;
|
||||
PositiveBlob::from_bytes(map, doc_indexes)
|
||||
})
|
||||
}
|
||||
}
|
@ -1,282 +0,0 @@
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::path::Path;
|
||||
use std::error::Error;
|
||||
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
use sdset::Set;
|
||||
|
||||
use crate::DocIndex;
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use serde::ser::{Serialize, Serializer, SerializeTuple};
|
||||
use serde::de::{self, Deserialize, Deserializer, SeqAccess, Visitor};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct PositiveBlob {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl PositiveBlob {
|
||||
pub unsafe fn from_paths<P, Q>(map: P, indexes: Q) -> Result<Self, Box<Error>>
|
||||
where P: AsRef<Path>,
|
||||
Q: AsRef<Path>,
|
||||
{
|
||||
let map = Map::from_path(map)?;
|
||||
let indexes = DocIndexes::from_path(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_bytes(map: Vec<u8>, indexes: Vec<u8>) -> Result<Self, Box<Error>> {
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
|
||||
pub fn from_raw(map: Map, indexes: DocIndexes) -> Self {
|
||||
PositiveBlob { map, indexes }
|
||||
}
|
||||
|
||||
pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<&[DocIndex]> {
|
||||
self.map.get(key).map(|index| &self.indexes[index as usize])
|
||||
}
|
||||
|
||||
pub fn as_map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn as_indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn explode(self) -> (Map, DocIndexes) {
|
||||
(self.map, self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for PositiveBlob {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
write!(f, "PositiveBlob([")?;
|
||||
let mut stream = self.into_stream();
|
||||
let mut first = true;
|
||||
while let Some((k, v)) = stream.next() {
|
||||
if !first {
|
||||
write!(f, ", ")?;
|
||||
}
|
||||
first = false;
|
||||
write!(f, "({}, {:?})", String::from_utf8_lossy(k), v)?;
|
||||
}
|
||||
write!(f, "])")
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m PositiveBlob {
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
/// The type of the stream to be constructed.
|
||||
type Into = PositiveBlobStream<'m>;
|
||||
|
||||
/// Construct a stream from `Self`.
|
||||
fn into_stream(self) -> Self::Into {
|
||||
PositiveBlobStream {
|
||||
map_stream: self.map.into_stream(),
|
||||
doc_indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobStream<'m> {
|
||||
map_stream: map::Stream<'m>,
|
||||
doc_indexes: &'m DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'a> Streamer<'a> for PositiveBlobStream<'m> {
|
||||
type Item = (&'a [u8], &'a [DocIndex]);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.map_stream.next() {
|
||||
Some((input, index)) => {
|
||||
let doc_indexes = &self.doc_indexes[index as usize];
|
||||
Some((input, doc_indexes))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Serialize for PositiveBlob {
|
||||
fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
|
||||
let mut tuple = serializer.serialize_tuple(2)?;
|
||||
tuple.serialize_element(&self.map.as_fst().to_vec())?;
|
||||
tuple.serialize_element(&self.indexes.to_vec())?;
|
||||
tuple.end()
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for PositiveBlob {
|
||||
fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<PositiveBlob, D::Error> {
|
||||
struct TupleVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for TupleVisitor {
|
||||
type Value = PositiveBlob;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
formatter.write_str("a PositiveBlob struct")
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn visit_seq<A: SeqAccess<'de>>(self, mut seq: A) -> Result<Self::Value, A::Error> {
|
||||
let map = match seq.next_element()? {
|
||||
Some(bytes) => match Map::from_bytes(bytes) {
|
||||
Ok(value) => value,
|
||||
Err(err) => return Err(de::Error::custom(err)),
|
||||
},
|
||||
None => return Err(de::Error::invalid_length(0, &self)),
|
||||
};
|
||||
|
||||
let indexes = match seq.next_element()? {
|
||||
Some(bytes) => match DocIndexes::from_bytes(bytes) {
|
||||
Ok(value) => value,
|
||||
Err(err) => return Err(de::Error::custom(err)),
|
||||
},
|
||||
None => return Err(de::Error::invalid_length(1, &self)),
|
||||
};
|
||||
|
||||
Ok(PositiveBlob { map, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
deserializer.deserialize_tuple(2, TupleVisitor)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBlobBuilder<W, X> {
|
||||
map: fst::MapBuilder<W>,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
impl PositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
PositiveBlobBuilder {
|
||||
map: fst::MapBuilder::memory(),
|
||||
indexes: DocIndexesBuilder::memory(),
|
||||
value: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> PositiveBlobBuilder<W, X> {
|
||||
pub fn new(map: W, indexes: X) -> Result<Self, Box<Error>> {
|
||||
Ok(PositiveBlobBuilder {
|
||||
map: fst::MapBuilder::new(map)?,
|
||||
indexes: DocIndexesBuilder::new(indexes),
|
||||
value: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// If a key is inserted that is less than or equal to any previous key added,
|
||||
/// then an error is returned. Similarly, if there was a problem writing
|
||||
/// to the underlying writer, an error is returned.
|
||||
// FIXME what if one write doesn't work but the other do ?
|
||||
pub fn insert<K>(&mut self, key: K, doc_indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(doc_indexes)?;
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
let map = self.map.into_inner()?;
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::error::Error;
    use crate::{Attribute, WordArea};

    use crate::DocumentId;

    /// The three document indexes shared by every test of this module.
    fn sample_doc_indexes() -> (DocIndex, DocIndex, DocIndex) {
        let first = DocIndex {
            document_id: DocumentId(0),
            attribute: Attribute::new_faillible(3, 11),
            word_area: WordArea::new_faillible(30, 4)
        };
        let second = DocIndex {
            document_id: DocumentId(1),
            attribute: Attribute::new_faillible(4, 21),
            word_area: WordArea::new_faillible(35, 6)
        };
        let third = DocIndex {
            document_id: DocumentId(2),
            attribute: Attribute::new_faillible(8, 2),
            word_area: WordArea::new_faillible(89, 6)
        };

        (first, second, third)
    }

    /// Builds a blob holding "aaa" => [a], "aab" => [a, b, c], "aac" => [a, c].
    fn build_blob(a: DocIndex, b: DocIndex, c: DocIndex) -> Result<PositiveBlob, Box<Error>> {
        let mut builder = PositiveBlobBuilder::memory();

        builder.insert("aaa", Set::new(&[a])?)?;
        builder.insert("aab", Set::new(&[a, b, c])?)?;
        builder.insert("aac", Set::new(&[a, c])?)?;

        let (map_bytes, indexes_bytes) = builder.into_inner()?;
        let blob = PositiveBlob::from_bytes(map_bytes, indexes_bytes)?;

        Ok(blob)
    }

    /// Checks that `blob` contains exactly what `build_blob` inserted.
    fn assert_blob_content(blob: &PositiveBlob, a: DocIndex, b: DocIndex, c: DocIndex) {
        assert_eq!(blob.get("aaa"), Some(&[a][..]));
        assert_eq!(blob.get("aab"), Some(&[a, b, c][..]));
        assert_eq!(blob.get("aac"), Some(&[a, c][..]));
        assert_eq!(blob.get("aad"), None);
    }

    #[test]
    fn serialize_deserialize() -> Result<(), Box<Error>> {
        let (a, b, c) = sample_doc_indexes();

        let positive_blob = build_blob(a, b, c)?;
        assert_blob_content(&positive_blob, a, b, c);

        Ok(())
    }

    #[test]
    fn serde_serialize_deserialize() -> Result<(), Box<Error>> {
        let (a, b, c) = sample_doc_indexes();

        let positive_blob = build_blob(a, b, c)?;

        // round trip through bincode before checking the content
        let bytes = bincode::serialize(&positive_blob)?;
        let positive_blob: PositiveBlob = bincode::deserialize(&bytes)?;

        assert_blob_content(&positive_blob, a, b, c);

        Ok(())
    }
}
|
@ -1,5 +0,0 @@
|
||||
mod blob;
|
||||
mod ops;
|
||||
|
||||
pub use self::blob::{PositiveBlob, PositiveBlobBuilder};
|
||||
pub use self::ops::OpBuilder;
|
@ -1,128 +0,0 @@
|
||||
use sdset::multi::OpBuilder as SdOpBuilder;
|
||||
use sdset::{SetOperation, Set};
|
||||
|
||||
use crate::database::blob::PositiveBlob;
|
||||
use crate::data::DocIndexes;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct OpBuilder<'m> {
|
||||
// the operation on the maps is always an union.
|
||||
map_op: fst::map::OpBuilder<'m>,
|
||||
indexes: Vec<&'m DocIndexes>,
|
||||
}
|
||||
|
||||
/// Do a set operation on multiple positive blobs.
|
||||
impl<'m> OpBuilder<'m> {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
map_op: fst::map::OpBuilder::new(),
|
||||
indexes: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn with_capacity(cap: usize) -> Self {
|
||||
Self {
|
||||
map_op: fst::map::OpBuilder::new(), // TODO patch fst to add with_capacity
|
||||
indexes: Vec::with_capacity(cap),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn add(mut self, blob: &'m PositiveBlob) -> Self {
|
||||
self.push(blob);
|
||||
self
|
||||
}
|
||||
|
||||
pub fn push(&mut self, blob: &'m PositiveBlob) {
|
||||
self.map_op.push(blob.as_map());
|
||||
self.indexes.push(blob.as_indexes());
|
||||
}
|
||||
|
||||
pub fn union(self) -> Union<'m> {
|
||||
Union::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn intersection(self) -> Intersection<'m> {
|
||||
Intersection::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn difference(self) -> Difference<'m> {
|
||||
Difference::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
|
||||
pub fn symmetric_difference(self) -> SymmetricDifference<'m> {
|
||||
SymmetricDifference::new(self.map_op.union(), self.indexes)
|
||||
}
|
||||
}
|
||||
|
||||
// Generates a stream type named `$name` that walks the union of the keys of
// several positive blobs and yields, for each key, the result of the sdset
// multi-set operation `$operation` applied to the document indexes stored
// under that key.
macro_rules! logical_operation {
    (struct $name:ident, $operation:ident) => {

/// Stream of `(key, document indexes)` pairs produced by a set operation.
pub struct $name<'m> {
    // union of the keys of every source map
    stream: fst::map::Union<'m>,
    indexes: Vec<&'m DocIndexes>,
    // owned copy of the key being yielded: keeping the key borrowed from
    // `stream` would forbid asking the stream for the following key when the
    // current one has to be skipped
    key: Vec<u8>,
    // result of the operation for the key being yielded
    outs: Vec<DocIndex>,
}

impl<'m> $name<'m> {
    fn new(stream: fst::map::Union<'m>, indexes: Vec<&'m DocIndexes>) -> Self {
        $name { stream, indexes, key: Vec::new(), outs: Vec::new() }
    }
}

impl<'m, 'a> fst::Streamer<'a> for $name<'m> {
    type Item = (&'a [u8], &'a Set<DocIndex>);

    /// Returns the next key whose operation result is not empty, or `None`
    /// once the underlying key stream is exhausted.
    fn next(&'a mut self) -> Option<Self::Item> {
        loop {
            let (input, ivalues) = self.stream.next()?;

            self.outs.clear();

            let mut builder = SdOpBuilder::with_capacity(ivalues.len());
            for ivalue in ivalues {
                let doc_indexes = &self.indexes[ivalue.index][ivalue.value as usize];
                let set = Set::new_unchecked(doc_indexes);
                builder.push(set);
            }

            builder.$operation().extend_vec(&mut self.outs);

            // an empty result only means that this key must not be yielded,
            // the following keys can still produce something
            if self.outs.is_empty() { continue }

            self.key.clear();
            self.key.extend_from_slice(input);

            return Some((&self.key[..], Set::new_unchecked(&self.outs)))
        }
    }
}
    }}
|
||||
|
||||
logical_operation!(struct Union, union);
|
||||
logical_operation!(struct Intersection, intersection);
|
||||
logical_operation!(struct Difference, difference);
|
||||
logical_operation!(struct SymmetricDifference, symmetric_difference);
|
@ -7,9 +7,9 @@ use rocksdb::rocksdb::{Writable, Snapshot};
|
||||
use rocksdb::{DB, DBVector, MergeOperands};
|
||||
use crossbeam::atomic::ArcCell;
|
||||
|
||||
use crate::database::index::Index;
|
||||
use crate::database::{DatabaseView, Update, Schema};
|
||||
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
||||
use crate::database::blob::{self, Blob};
|
||||
|
||||
pub struct Database {
|
||||
// DB is under a Mutex to sync update ingestions and separate DB update locking
|
||||
@ -85,12 +85,9 @@ impl Database {
|
||||
Err(e) => return Err(e.to_string().into()),
|
||||
};
|
||||
|
||||
let move_update = update.can_be_moved();
|
||||
let path = update.into_path_buf();
|
||||
let path = path.to_string_lossy();
|
||||
|
||||
let mut options = IngestExternalFileOptions::new();
|
||||
options.move_files(move_update);
|
||||
let path = update.path().to_string_lossy();
|
||||
let options = IngestExternalFileOptions::new();
|
||||
// options.move_files(move_update);
|
||||
|
||||
let cf_handle = db.cf_handle("default").expect("\"default\" column family not found");
|
||||
db.ingest_external_file_optimized(&cf_handle, &options, &[&path])?;
|
||||
@ -124,30 +121,29 @@ impl Database {
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_indexes(key: &[u8], existing_value: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
if key != DATA_INDEX {
|
||||
panic!("The merge operator only supports \"data-index\" merging")
|
||||
}
|
||||
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
|
||||
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging");
|
||||
|
||||
let capacity = {
|
||||
let remaining = operands.size_hint().0;
|
||||
let already_exist = usize::from(existing_value.is_some());
|
||||
remaining + already_exist
|
||||
let mut index: Option<Index> = None;
|
||||
|
||||
for bytes in existing.into_iter().chain(operands) {
|
||||
let bytes_len = bytes.len();
|
||||
let bytes = Arc::new(bytes.to_vec());
|
||||
let operand = Index::from_shared_bytes(bytes, 0, bytes_len);
|
||||
let operand = operand.expect("BUG: could not deserialize index");
|
||||
|
||||
let merged = match index {
|
||||
Some(ref index) => index.merge(&operand).expect("BUG: could not merge index"),
|
||||
None => operand,
|
||||
};
|
||||
|
||||
let mut op = blob::OpBuilder::with_capacity(capacity);
|
||||
if let Some(existing_value) = existing_value {
|
||||
let blob = bincode::deserialize(existing_value).expect("BUG: could not deserialize data-index");
|
||||
op.push(Blob::Positive(blob));
|
||||
index.replace(merged);
|
||||
}
|
||||
|
||||
for bytes in operands {
|
||||
let blob = bincode::deserialize(bytes).expect("BUG: could not deserialize blob");
|
||||
op.push(blob);
|
||||
}
|
||||
|
||||
let blob = op.merge().expect("BUG: could not merge blobs");
|
||||
bincode::serialize(&blob).expect("BUG: could not serialize merged blob")
|
||||
let index = index.unwrap_or_default();
|
||||
let mut bytes = Vec::new();
|
||||
index.write_to_bytes(&mut bytes);
|
||||
bytes
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@ -158,12 +154,12 @@ mod tests {
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use tempfile::tempdir;
|
||||
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
use crate::database::update::PositiveUpdateBuilder;
|
||||
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
|
||||
use crate::database::update::UpdateBuilder;
|
||||
use crate::tokenizer::DefaultBuilder;
|
||||
|
||||
#[test]
|
||||
fn ingest_update_file() -> Result<(), Box<Error>> {
|
||||
fn ingest_one_update_file() -> Result<(), Box<Error>> {
|
||||
let dir = tempdir()?;
|
||||
|
||||
let rocksdb_path = dir.path().join("rocksdb.rdb");
|
||||
@ -186,7 +182,6 @@ mod tests {
|
||||
};
|
||||
|
||||
let database = Database::create(&rocksdb_path, schema.clone())?;
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
|
||||
let update_path = dir.path().join("update.sst");
|
||||
|
||||
@ -205,16 +200,16 @@ mod tests {
|
||||
|
||||
let docid0;
|
||||
let docid1;
|
||||
let mut update = {
|
||||
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
||||
let update = {
|
||||
let tokenizer_builder = DefaultBuilder::new();
|
||||
let mut builder = UpdateBuilder::new(update_path, schema);
|
||||
|
||||
docid0 = builder.update(&doc0).unwrap();
|
||||
docid1 = builder.update(&doc1).unwrap();
|
||||
docid0 = builder.update_document(&doc0, &tokenizer_builder)?;
|
||||
docid1 = builder.update_document(&doc1, &tokenizer_builder)?;
|
||||
|
||||
builder.build()?
|
||||
};
|
||||
|
||||
update.set_move(true);
|
||||
database.ingest_update_file(update)?;
|
||||
let view = database.view();
|
||||
|
||||
@ -226,4 +221,100 @@ mod tests {
|
||||
|
||||
Ok(dir.close()?)
|
||||
}
|
||||
|
||||
/// Ingests two update files one after the other and checks that the four
/// documents they carry can all be read back from a database view.
#[test]
fn ingest_two_update_files() -> Result<(), Box<Error>> {
    #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
    struct SimpleDoc {
        id: u64,
        title: String,
        description: String,
        timestamp: u64,
    }

    let dir = tempdir()?;
    let rocksdb_path = dir.path().join("rocksdb.rdb");

    let mut schema_builder = SchemaBuilder::with_identifier("id");
    schema_builder.new_attribute("id", STORED);
    schema_builder.new_attribute("title", STORED | INDEXED);
    schema_builder.new_attribute("description", STORED | INDEXED);
    schema_builder.new_attribute("timestamp", STORED);
    let schema = schema_builder.build();

    let database = Database::create(&rocksdb_path, schema.clone())?;

    let doc0 = SimpleDoc {
        id: 0,
        title: String::from("I am a title"),
        description: String::from("I am a description"),
        timestamp: 1234567,
    };
    let doc1 = SimpleDoc {
        id: 1,
        title: String::from("I am the second title"),
        description: String::from("I am the second description"),
        timestamp: 7654321,
    };
    let doc2 = SimpleDoc {
        id: 2,
        title: String::from("I am the third title"),
        description: String::from("I am the third description"),
        timestamp: 7654321,
    };
    let doc3 = SimpleDoc {
        id: 3,
        title: String::from("I am the fourth title"),
        description: String::from("I am the fourth description"),
        timestamp: 7654321,
    };

    // writes one update file holding two documents and returns the update
    // along with the identifiers given to the documents
    let build_update = |file_name: &str, first: &SimpleDoc, second: &SimpleDoc| -> Result<_, Box<Error>> {
        let tokenizer_builder = DefaultBuilder::new();
        let update_path = dir.path().join(file_name);
        let mut builder = UpdateBuilder::new(update_path, schema.clone());

        let first_id = builder.update_document(first, &tokenizer_builder)?;
        let second_id = builder.update_document(second, &tokenizer_builder)?;

        Ok((builder.build()?, first_id, second_id))
    };

    let (update1, docid0, docid1) = build_update("update-000.sst", &doc0, &doc1)?;
    let (update2, docid2, docid3) = build_update("update-001.sst", &doc2, &doc3)?;

    database.ingest_update_file(update1)?;
    database.ingest_update_file(update2)?;

    let view = database.view();

    let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
    let de_doc1: SimpleDoc = view.document_by_id(docid1)?;

    assert_eq!(doc0, de_doc0);
    assert_eq!(doc1, de_doc1);

    let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
    let de_doc3: SimpleDoc = view.document_by_id(docid3)?;

    assert_eq!(doc2, de_doc2);
    assert_eq!(doc3, de_doc3);

    Ok(dir.close()?)
}
|
||||
}
|
||||
|
@ -9,9 +9,9 @@ use serde::de::DeserializeOwned;
|
||||
|
||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
||||
use crate::database::{retrieve_data_schema, retrieve_data_index};
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::deserializer::Deserializer;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::database::index::Index;
|
||||
use crate::rank::{QueryBuilder, FilterFunc};
|
||||
use crate::DocumentId;
|
||||
|
||||
@ -19,7 +19,7 @@ pub struct DatabaseView<D>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
snapshot: Snapshot<D>,
|
||||
blob: PositiveBlob,
|
||||
index: Index,
|
||||
schema: Schema,
|
||||
}
|
||||
|
||||
@ -28,16 +28,16 @@ where D: Deref<Target=DB>
|
||||
{
|
||||
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
|
||||
let schema = retrieve_data_schema(&snapshot)?;
|
||||
let blob = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, blob, schema })
|
||||
let index = retrieve_data_index(&snapshot)?;
|
||||
Ok(DatabaseView { snapshot, index, schema })
|
||||
}
|
||||
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.schema
|
||||
}
|
||||
|
||||
pub fn blob(&self) -> &PositiveBlob {
|
||||
&self.blob
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.index
|
||||
}
|
||||
|
||||
pub fn into_snapshot(self) -> Snapshot<D> {
|
||||
|
82
src/database/index/mod.rs
Normal file
82
src/database/index/mod.rs
Normal file
@ -0,0 +1,82 @@
|
||||
mod negative;
|
||||
mod positive;
|
||||
|
||||
pub(crate) use self::negative::Negative;
|
||||
pub(crate) use self::positive::{Positive, PositiveBuilder};
|
||||
|
||||
use std::error::Error;
|
||||
use std::io::Cursor;
|
||||
use std::sync::Arc;
|
||||
|
||||
use fst::{IntoStreamer, Streamer};
|
||||
use sdset::duo::DifferenceByKey;
|
||||
use sdset::{Set, SetOperation};
|
||||
use fst::Map;
|
||||
|
||||
use crate::data::{SharedData, DocIndexes};
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Index {
|
||||
pub(crate) negative: Negative,
|
||||
pub(crate) positive: Positive,
|
||||
}
|
||||
|
||||
impl Index {
|
||||
pub fn from_bytes(bytes: Vec<u8>) -> Result<Index, Box<Error>> {
|
||||
let len = bytes.len();
|
||||
Index::from_shared_bytes(Arc::new(bytes), 0, len)
|
||||
}
|
||||
|
||||
pub fn from_shared_bytes(
|
||||
bytes: Arc<Vec<u8>>,
|
||||
offset: usize,
|
||||
len: usize,
|
||||
) -> Result<Index, Box<Error>>
|
||||
{
|
||||
let data = SharedData::new(bytes, offset, len);
|
||||
let mut cursor = Cursor::new(data);
|
||||
|
||||
let negative = Negative::from_cursor(&mut cursor)?;
|
||||
let positive = Positive::from_cursor(&mut cursor)?;
|
||||
Ok(Index { negative, positive })
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
self.negative.write_to_bytes(bytes);
|
||||
self.positive.write_to_bytes(bytes);
|
||||
}
|
||||
|
||||
pub fn merge(&self, other: &Index) -> Result<Index, Box<Error>> {
|
||||
if other.negative.is_empty() {
|
||||
let negative = Negative::default();
|
||||
let positive = self.positive.union(&other.positive)?;
|
||||
return Ok(Index { negative, positive })
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
let mut stream = self.positive.into_stream();
|
||||
while let Some((key, indexes)) = stream.next() {
|
||||
let op = DifferenceByKey::new(indexes, &other.negative, |x| x.document_id, |x| *x);
|
||||
|
||||
buffer.clear();
|
||||
op.extend_vec(&mut buffer);
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
}
|
||||
|
||||
let positive = {
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
let negative = Negative::default();
|
||||
let positive = positive.union(&other.positive)?;
|
||||
Ok(Index { negative, positive })
|
||||
}
|
||||
}
|
43
src/database/index/negative.rs
Normal file
43
src/database/index/negative.rs
Normal file
@ -0,0 +1,43 @@
|
||||
use std::error::Error;
|
||||
use std::io::Cursor;
|
||||
use std::ops::Deref;
|
||||
|
||||
use sdset::Set;
|
||||
use byteorder::{LittleEndian, WriteBytesExt};
|
||||
|
||||
use crate::data::SharedData;
|
||||
use crate::data::DocIds;
|
||||
use crate::DocumentId;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Negative(DocIds);
|
||||
|
||||
impl Negative {
|
||||
pub fn new(doc_ids: DocIds) -> Negative {
|
||||
Negative(doc_ids)
|
||||
}
|
||||
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Negative, Box<Error>> {
|
||||
let doc_ids = DocIds::from_cursor(cursor)?;
|
||||
Ok(Negative(doc_ids))
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.0.as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
}
|
||||
|
||||
pub fn is_empty(&self) -> bool {
|
||||
self.0.is_empty()
|
||||
}
|
||||
}
|
||||
|
||||
impl Deref for Negative {
|
||||
type Target = Set<DocumentId>;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
self.0.as_ref()
|
||||
}
|
||||
}
|
166
src/database/index/positive.rs
Normal file
166
src/database/index/positive.rs
Normal file
@ -0,0 +1,166 @@
|
||||
use std::io::{Write, BufRead, Cursor};
|
||||
use std::error::Error;
|
||||
|
||||
use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt};
|
||||
use fst::{map, Map, Streamer, IntoStreamer};
|
||||
use sdset::{Set, SetOperation};
|
||||
use sdset::duo::Union;
|
||||
use fst::raw::Fst;
|
||||
|
||||
use crate::data::{DocIndexes, DocIndexesBuilder};
|
||||
use crate::data::SharedData;
|
||||
use crate::DocIndex;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Positive {
|
||||
map: Map,
|
||||
indexes: DocIndexes,
|
||||
}
|
||||
|
||||
impl Positive {
|
||||
pub fn new(map: Map, indexes: DocIndexes) -> Positive {
|
||||
Positive { map, indexes }
|
||||
}
|
||||
|
||||
pub fn from_cursor(cursor: &mut Cursor<SharedData>) -> Result<Positive, Box<Error>> {
|
||||
let len = cursor.read_u64::<LittleEndian>()? as usize;
|
||||
let offset = cursor.position() as usize;
|
||||
let data = cursor.get_ref().range(offset, len);
|
||||
|
||||
let fst = Fst::from_shared_bytes(data.bytes, data.offset, data.len)?;
|
||||
let map = Map::from(fst);
|
||||
cursor.consume(len);
|
||||
|
||||
let indexes = DocIndexes::from_cursor(cursor)?;
|
||||
|
||||
Ok(Positive { map, indexes})
|
||||
}
|
||||
|
||||
pub fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
|
||||
let slice = self.map.as_fst().as_bytes();
|
||||
let len = slice.len() as u64;
|
||||
let _ = bytes.write_u64::<LittleEndian>(len);
|
||||
bytes.extend_from_slice(slice);
|
||||
|
||||
self.indexes.write_to_bytes(bytes);
|
||||
}
|
||||
|
||||
pub fn map(&self) -> &Map {
|
||||
&self.map
|
||||
}
|
||||
|
||||
pub fn indexes(&self) -> &DocIndexes {
|
||||
&self.indexes
|
||||
}
|
||||
|
||||
pub fn union(&self, other: &Positive) -> Result<Positive, Box<Error>> {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
let mut stream = map::OpBuilder::new().add(&self.map).add(&other.map).union();
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
while let Some((key, ivalues)) = stream.next() {
|
||||
buffer.clear();
|
||||
match ivalues {
|
||||
[a, b] => {
|
||||
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
|
||||
let a = Set::new_unchecked(indexes);
|
||||
|
||||
let indexes = if b.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(b.value as usize).ok_or(format!("index not found"))?;
|
||||
let b = Set::new_unchecked(indexes);
|
||||
|
||||
let op = Union::new(a, b);
|
||||
op.extend_vec(&mut buffer);
|
||||
},
|
||||
[a] => {
|
||||
let indexes = if a.index == 0 { &self.indexes } else { &other.indexes };
|
||||
let indexes = indexes.get(a.value as usize).ok_or(format!("index not found"))?;
|
||||
buffer.extend_from_slice(indexes)
|
||||
},
|
||||
_ => continue,
|
||||
}
|
||||
|
||||
if !buffer.is_empty() {
|
||||
let indexes = Set::new_unchecked(&buffer);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
}
|
||||
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Ok(Positive { map, indexes })
|
||||
}
|
||||
}
|
||||
|
||||
impl<'m, 'a> IntoStreamer<'a> for &'m Positive {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
/// The type of the stream to be constructed.
|
||||
type Into = Stream<'m>;
|
||||
|
||||
/// Construct a stream from `Self`.
|
||||
fn into_stream(self) -> Self::Into {
|
||||
Stream {
|
||||
map_stream: self.map.into_stream(),
|
||||
indexes: &self.indexes,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Stream<'m> {
|
||||
map_stream: map::Stream<'m>,
|
||||
indexes: &'m DocIndexes,
|
||||
}
|
||||
|
||||
impl<'m, 'a> Streamer<'a> for Stream<'m> {
|
||||
type Item = (&'a [u8], &'a Set<DocIndex>);
|
||||
|
||||
fn next(&'a mut self) -> Option<Self::Item> {
|
||||
match self.map_stream.next() {
|
||||
Some((input, index)) => {
|
||||
let indexes = &self.indexes[index as usize];
|
||||
let indexes = Set::new_unchecked(indexes);
|
||||
Some((input, indexes))
|
||||
},
|
||||
None => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PositiveBuilder<W, X> {
|
||||
map: fst::MapBuilder<W>,
|
||||
indexes: DocIndexesBuilder<X>,
|
||||
value: u64,
|
||||
}
|
||||
|
||||
impl PositiveBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
PositiveBuilder {
|
||||
map: fst::MapBuilder::memory(),
|
||||
indexes: DocIndexesBuilder::memory(),
|
||||
value: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> PositiveBuilder<W, X> {
|
||||
/// If a key is inserted that is less than or equal to any previous key added,
|
||||
/// then an error is returned. Similarly, if there was a problem writing
|
||||
/// to the underlying writer, an error is returned.
|
||||
// FIXME what if one write doesn't work but the other do ?
|
||||
pub fn insert<K>(&mut self, key: K, indexes: &Set<DocIndex>) -> Result<(), Box<Error>>
|
||||
where K: AsRef<[u8]>,
|
||||
{
|
||||
self.map.insert(key, self.value)?;
|
||||
self.indexes.insert(indexes);
|
||||
self.value += 1;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn into_inner(self) -> Result<(W, X), Box<Error>> {
|
||||
let map = self.map.into_inner()?;
|
||||
let indexes = self.indexes.into_inner()?;
|
||||
Ok((map, indexes))
|
||||
}
|
||||
}
|
@ -1,47 +1,29 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
use std::ops::Deref;
|
||||
use std::sync::Arc;
|
||||
|
||||
use rocksdb::rocksdb::{DB, Snapshot};
|
||||
|
||||
pub use self::update::{
|
||||
Update, PositiveUpdateBuilder, NewState,
|
||||
SerializerError, NegativeUpdateBuilder
|
||||
};
|
||||
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
|
||||
pub use self::database_view::{DatabaseView, DocumentIter};
|
||||
pub use self::update::{Update, UpdateBuilder};
|
||||
pub use self::serde::SerializerError;
|
||||
pub use self::database::Database;
|
||||
pub use self::schema::Schema;
|
||||
use self::blob::positive::PositiveBlob;
|
||||
pub use self::index::Index;
|
||||
|
||||
const DATA_INDEX: &[u8] = b"data-index";
|
||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||
|
||||
// Generates, for every `type => method` pair, a serializer method that
// rejects the value with `SerializerError::UnserializableType`, reporting the
// name of the rejected type.
macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // `stringify!` is required here: a metavariable written
                // inside a string literal is never substituted
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
            }
        )*
    }
}
|
||||
|
||||
pub mod blob;
|
||||
pub mod schema;
|
||||
pub(crate) mod index;
|
||||
mod update;
|
||||
mod serde;
|
||||
mod database;
|
||||
mod document_key;
|
||||
mod database_view;
|
||||
mod deserializer;
|
||||
|
||||
/// Hashes `t` with a freshly created `DefaultHasher` and returns the
/// resulting 64 bits.
fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::default();
    Hash::hash(t, &mut hasher);
    Hasher::finish(&hasher)
}
|
||||
|
||||
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
@ -51,11 +33,15 @@ where D: Deref<Target=DB>
|
||||
}
|
||||
}
|
||||
|
||||
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
|
||||
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
|
||||
where D: Deref<Target=DB>
|
||||
{
|
||||
match snapshot.get(DATA_INDEX)? {
|
||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||
None => Ok(PositiveBlob::default()),
|
||||
Some(vector) => {
|
||||
let bytes_len = vector.as_ref().len();
|
||||
let bytes = Arc::new(vector.as_ref().to_vec());
|
||||
Ok(Index::from_shared_bytes(bytes, 0, bytes_len)?)
|
||||
},
|
||||
None => Ok(Index::default()),
|
||||
}
|
||||
}
|
||||
|
@ -1,6 +1,4 @@
|
||||
use crate::database::update::SerializerError;
|
||||
use std::collections::{HashMap, BTreeMap};
|
||||
use crate::database::calculate_hash;
|
||||
use std::io::{Read, Write};
|
||||
use std::error::Error;
|
||||
use std::{fmt, u16};
|
||||
@ -8,9 +6,11 @@ use std::ops::BitOr;
|
||||
use std::sync::Arc;
|
||||
|
||||
use serde_derive::{Serialize, Deserialize};
|
||||
use serde::ser::{self, Serialize};
|
||||
use linked_hash_map::LinkedHashMap;
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::database::serde::find_id::FindDocumentIdSerializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false };
|
||||
@ -141,13 +141,12 @@ impl Schema {
|
||||
attributes
|
||||
}
|
||||
|
||||
pub fn document_id<T>(&self, document: &T) -> Result<DocumentId, SerializerError>
|
||||
pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
let find_document_id = FindDocumentIdSerializer {
|
||||
id_attribute_name: self.identifier_name(),
|
||||
};
|
||||
document.serialize(find_document_id)
|
||||
let id_attribute_name = &self.inner.identifier;
|
||||
let serializer = FindDocumentIdSerializer { id_attribute_name };
|
||||
document.serialize(serializer)
|
||||
}
|
||||
|
||||
pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
|
||||
@ -188,192 +187,6 @@ impl fmt::Display for SchemaAttr {
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializer state carrying the name of the attribute that holds the
/// document identifier.
struct FindDocumentIdSerializer<'a> {
    // name of the identifier attribute
    id_attribute_name: &'a str,
}
|
||||
|
||||
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
// Ok(MapSerializer {
|
||||
// schema: self.schema,
|
||||
// document_id: self.document_id,
|
||||
// new_states: self.new_states,
|
||||
// })
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(FindDocumentIdStructSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
struct FindDocumentIdStructSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if self.id_attribute_name == key {
|
||||
// TODO can it be possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
243
src/database/serde/find_id.rs
Normal file
243
src/database/serde/find_id.rs
Normal file
@ -0,0 +1,243 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::serde::{SerializerError, calculate_hash};
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct FindDocumentIdSerializer<'a> {
|
||||
pub id_attribute_name: &'a str,
|
||||
}
|
||||
|
||||
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = FindDocumentIdMapSerializer<'a>;
|
||||
type SerializeStruct = FindDocumentIdStructSerializer<'a>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(FindDocumentIdMapSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
current_key_name: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(FindDocumentIdStructSerializer {
|
||||
id_attribute_name: self.id_attribute_name,
|
||||
document_id: None,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdMapSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
current_key_name: Option<String>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if self.id_attribute_name == key {
|
||||
// TODO is it possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct FindDocumentIdStructSerializer<'a> {
|
||||
id_attribute_name: &'a str,
|
||||
document_id: Option<DocumentId>,
|
||||
}
|
||||
|
||||
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
|
||||
type Ok = DocumentId;
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if self.id_attribute_name == key {
|
||||
// TODO can it be possible to have multiple ids?
|
||||
let id = bincode::serialize(value).unwrap();
|
||||
let hash = calculate_hash(&id);
|
||||
self.document_id = Some(DocumentId(hash));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
match self.document_id {
|
||||
Some(document_id) => Ok(document_id),
|
||||
None => Err(SerializerError::DocumentIdNotFound)
|
||||
}
|
||||
}
|
||||
}
|
188
src/database/serde/indexer_serializer.rs
Normal file
188
src/database/serde/indexer_serializer.rs
Normal file
@ -0,0 +1,188 @@
|
||||
use crate::database::update::RawUpdateBuilder;
|
||||
use crate::database::schema::SchemaAttr;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::tokenizer::Token;
|
||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
||||
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
pub struct IndexerSerializer<'a, B> {
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
pub document_id: DocumentId,
|
||||
pub attribute: SchemaAttr,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
||||
// FIXME must u32::try_from instead
|
||||
let attribute = match Attribute::new(self.attribute.0, word_index as u32) {
|
||||
Ok(attribute) => attribute,
|
||||
Err(_) => return Ok(()),
|
||||
};
|
||||
|
||||
// FIXME must u16/u32::try_from instead
|
||||
let word_area = match WordArea::new(char_index as u32, word.len() as u16) {
|
||||
Ok(word_area) => word_area,
|
||||
Err(_) => return Ok(()),
|
||||
};
|
||||
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute,
|
||||
word_area
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert_doc_index(word_unidecoded.into_bytes(), doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert_doc_index(word_lower.into_bytes(), doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
146
src/database/serde/key_to_string.rs
Normal file
146
src/database/serde/key_to_string.rs
Normal file
@ -0,0 +1,146 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::SerializerError;
|
||||
|
||||
pub struct KeyToStringSerializer;
|
||||
|
||||
impl ser::Serializer for KeyToStringSerializer {
|
||||
type Ok = String;
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(value.to_string())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
57
src/database/serde/mod.rs
Normal file
57
src/database/serde/mod.rs
Normal file
@ -0,0 +1,57 @@
|
||||
use std::collections::hash_map::DefaultHasher;
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use serde::ser;
|
||||
|
||||
/// Generates `Serializer` methods that reject a primitive type.
///
/// For every `type => method,` pair, a method named `method` taking a value
/// of `type` is emitted; it always fails with
/// `SerializerError::UnserializableType`, whose `name` is the name of the
/// rejected type (e.g. `"bool"`).
macro_rules! forward_to_unserializable_type {
    ($($ty:ident => $se_method:ident,)*) => {
        $(
            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                // Metavariables are not expanded inside string literals, so
                // the type name must be produced with `stringify!`.
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
            }
        )*
    }
}
|
||||
|
||||
pub mod find_id;
|
||||
pub mod key_to_string;
|
||||
pub mod serializer;
|
||||
pub mod indexer_serializer;
|
||||
|
||||
/// Hashes `t` with a freshly created `DefaultHasher` and returns the
/// resulting 64-bit digest.
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
    let mut hasher = DefaultHasher::new();
    Hash::hash(t, &mut hasher);
    hasher.finish()
}
|
||||
|
||||
#[derive(Debug)]
|
||||
pub enum SerializerError {
|
||||
DocumentIdNotFound,
|
||||
UnserializableType { name: &'static str },
|
||||
Custom(String),
|
||||
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
write!(f, "serialized document does not have an id according to the schema")
|
||||
}
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Error for SerializerError {}
|
281
src/database/serde/serializer.rs
Normal file
281
src/database/serde/serializer.rs
Normal file
@ -0,0 +1,281 @@
|
||||
use serde::Serialize;
|
||||
use serde::ser;
|
||||
|
||||
use crate::database::serde::indexer_serializer::IndexerSerializer;
|
||||
use crate::database::serde::key_to_string::KeyToStringSerializer;
|
||||
use crate::database::document_key::DocumentKeyAttr;
|
||||
use crate::database::update::RawUpdateBuilder;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::schema::Schema;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct Serializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = MapSerializer<'a, B>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Ok(MapSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
current_key_name: None,
|
||||
builder: self.builder,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
pub struct MapSerializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub document_id: DocumentId,
|
||||
pub current_key_name: Option<String>,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeMap for MapSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
self.current_key_name = Some(key);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
let key = self.current_key_name.take().unwrap();
|
||||
self.serialize_entry(&key, value)
|
||||
}
|
||||
|
||||
fn serialize_entry<K: ?Sized, V: ?Sized>(
|
||||
&mut self,
|
||||
key: &K,
|
||||
value: &V
|
||||
) -> Result<(), Self::Error>
|
||||
where K: Serialize, V: Serialize,
|
||||
{
|
||||
let key = key.serialize(KeyToStringSerializer)?;
|
||||
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.builder.insert_attribute_value(key, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct StructSerializer<'a, B> {
|
||||
pub schema: &'a Schema,
|
||||
pub document_id: DocumentId,
|
||||
pub tokenizer_builder: &'a B,
|
||||
pub builder: &'a mut RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.builder.insert_attribute_value(key, value);
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
60
src/database/update/builder.rs
Normal file
60
src/database/update/builder.rs
Normal file
@ -0,0 +1,60 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::database::serde::serializer::Serializer;
|
||||
use crate::database::serde::SerializerError;
|
||||
use crate::tokenizer::TokenizerBuilder;
|
||||
use crate::database::Schema;
|
||||
|
||||
use crate::DocumentId;
|
||||
use super::{Update, RawUpdateBuilder};
|
||||
|
||||
pub struct UpdateBuilder {
|
||||
schema: Schema,
|
||||
raw_builder: RawUpdateBuilder,
|
||||
}
|
||||
|
||||
impl UpdateBuilder {
|
||||
pub fn new(path: PathBuf, schema: Schema) -> UpdateBuilder {
|
||||
UpdateBuilder {
|
||||
schema: schema,
|
||||
raw_builder: RawUpdateBuilder::new(path),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update_document<T, B>(
|
||||
&mut self,
|
||||
document: T,
|
||||
tokenizer_builder: &B,
|
||||
) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
B: TokenizerBuilder,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
document_id: document_id,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: &mut self.raw_builder,
|
||||
};
|
||||
|
||||
document.serialize(serializer)?;
|
||||
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
|
||||
where T: Serialize,
|
||||
{
|
||||
let document_id = self.schema.document_id(&document)?;
|
||||
self.raw_builder.remove_document(document_id);
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
self.raw_builder.build()
|
||||
}
|
||||
}
|
@ -1,35 +1,17 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
mod negative;
|
||||
mod positive;
|
||||
mod builder;
|
||||
mod raw_builder;
|
||||
|
||||
pub use self::positive::{PositiveUpdateBuilder, NewState, SerializerError};
|
||||
pub use self::negative::NegativeUpdateBuilder;
|
||||
pub use self::builder::UpdateBuilder;
|
||||
pub use self::raw_builder::RawUpdateBuilder;
|
||||
|
||||
pub struct Update {
|
||||
path: PathBuf,
|
||||
can_be_moved: bool,
|
||||
sst_file: PathBuf,
|
||||
}
|
||||
|
||||
impl Update {
|
||||
pub fn open<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: false })
|
||||
}
|
||||
|
||||
pub fn open_and_move<P: Into<PathBuf>>(path: P) -> Result<Update, Box<Error>> {
|
||||
Ok(Update { path: path.into(), can_be_moved: true })
|
||||
}
|
||||
|
||||
pub fn set_move(&mut self, can_be_moved: bool) {
|
||||
self.can_be_moved = can_be_moved
|
||||
}
|
||||
|
||||
pub fn can_be_moved(&self) -> bool {
|
||||
self.can_be_moved
|
||||
}
|
||||
|
||||
pub fn into_path_buf(self) -> PathBuf {
|
||||
self.path
|
||||
pub fn path(&self) -> &Path {
|
||||
&self.sst_file
|
||||
}
|
||||
}
|
||||
|
@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::NegativeUpdateBuilder;
|
@ -1,37 +0,0 @@
|
||||
use std::collections::BTreeSet;
|
||||
use std::io;
|
||||
|
||||
use byteorder::{NativeEndian, WriteBytesExt};
|
||||
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct UnorderedNegativeBlobBuilder<W> {
|
||||
doc_ids: BTreeSet<DocumentId>, // TODO: prefer a linked-list
|
||||
wrt: W,
|
||||
}
|
||||
|
||||
impl UnorderedNegativeBlobBuilder<Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
UnorderedNegativeBlobBuilder::new(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: io::Write> UnorderedNegativeBlobBuilder<W> {
|
||||
pub fn new(wrt: W) -> Self {
|
||||
Self {
|
||||
doc_ids: BTreeSet::new(),
|
||||
wrt: wrt,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert(&mut self, doc: DocumentId) -> bool {
|
||||
self.doc_ids.insert(doc)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> io::Result<W> {
|
||||
for id in self.doc_ids {
|
||||
self.wrt.write_u64::<NativeEndian>(id.0)?;
|
||||
}
|
||||
Ok(self.wrt)
|
||||
}
|
||||
}
|
@ -1,60 +0,0 @@
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
|
||||
use crate::database::update::negative::unordered_builder::UnorderedNegativeBlobBuilder;
|
||||
use crate::database::blob::{Blob, NegativeBlob};
|
||||
use crate::database::update::Update;
|
||||
use crate::database::DocumentKey;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::DocumentId;
|
||||
|
||||
pub struct NegativeUpdateBuilder {
|
||||
path: PathBuf,
|
||||
doc_ids: UnorderedNegativeBlobBuilder<Vec<u8>>,
|
||||
}
|
||||
|
||||
impl NegativeUpdateBuilder {
|
||||
pub fn new<P: Into<PathBuf>>(path: P) -> NegativeUpdateBuilder {
|
||||
NegativeUpdateBuilder {
|
||||
path: path.into(),
|
||||
doc_ids: UnorderedNegativeBlobBuilder::memory(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn remove(&mut self, id: DocumentId) -> bool {
|
||||
self.doc_ids.insert(id)
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let bytes = self.doc_ids.into_inner()?;
|
||||
let negative_blob = NegativeBlob::from_bytes(bytes)?;
|
||||
let blob = Blob::Negative(negative_blob);
|
||||
|
||||
// write the data-index aka negative blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// FIXME remove this ugly thing !
|
||||
// let Blob::Negative(negative_blob) = blob;
|
||||
let negative_blob = match blob {
|
||||
Blob::Negative(blob) => blob,
|
||||
Blob::Positive(_) => unreachable!(),
|
||||
};
|
||||
|
||||
for &document_id in negative_blob.as_ref().as_slice() {
|
||||
let start = DocumentKey::new(document_id);
|
||||
let end = start.with_attribute_max();
|
||||
file_writer.delete_range(start.as_ref(), end.as_ref())?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
@ -1,4 +0,0 @@
|
||||
mod update;
|
||||
mod unordered_builder;
|
||||
|
||||
pub use self::update::{PositiveUpdateBuilder, NewState, SerializerError};
|
@ -1,49 +0,0 @@
|
||||
#![allow(unused)]
|
||||
|
||||
use std::collections::BTreeMap;
|
||||
use std::error::Error;
|
||||
use std::io::Write;
|
||||
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::blob::positive::PositiveBlobBuilder;
|
||||
use crate::DocIndex;
|
||||
|
||||
pub struct UnorderedPositiveBlobBuilder<W, X> {
|
||||
builder: PositiveBlobBuilder<W, X>,
|
||||
map: BTreeMap<Vec<u8>, Vec<DocIndex>>,
|
||||
}
|
||||
|
||||
impl UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>> {
|
||||
pub fn memory() -> Self {
|
||||
Self {
|
||||
builder: PositiveBlobBuilder::memory(),
|
||||
map: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<W: Write, X: Write> UnorderedPositiveBlobBuilder<W, X> {
|
||||
pub fn new(map_wtr: W, doc_wtr: X) -> Result<Self, Box<Error>> {
|
||||
Ok(UnorderedPositiveBlobBuilder {
|
||||
builder: PositiveBlobBuilder::new(map_wtr, doc_wtr)?,
|
||||
map: BTreeMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
pub fn insert<K: Into<Vec<u8>>>(&mut self, input: K, doc_index: DocIndex) {
|
||||
self.map.entry(input.into()).or_insert_with(Vec::new).push(doc_index);
|
||||
}
|
||||
|
||||
pub fn finish(self) -> Result<(), Box<Error>> {
|
||||
self.into_inner().map(drop)
|
||||
}
|
||||
|
||||
pub fn into_inner(mut self) -> Result<(W, X), Box<Error>> {
|
||||
for (key, mut doc_indexes) in self.map {
|
||||
doc_indexes.sort_unstable();
|
||||
self.builder.insert(&key, Set::new_unchecked(&doc_indexes))?;
|
||||
}
|
||||
self.builder.into_inner()
|
||||
}
|
||||
}
|
@ -1,504 +0,0 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
use std::fmt;
|
||||
|
||||
use ::rocksdb::rocksdb_options;
|
||||
use serde::ser::{self, Serialize};
|
||||
|
||||
use crate::database::update::positive::unordered_builder::UnorderedPositiveBlobBuilder;
|
||||
use crate::database::blob::positive::PositiveBlob;
|
||||
use crate::database::schema::{Schema, SchemaAttr};
|
||||
use crate::tokenizer::{TokenizerBuilder, Token};
|
||||
use crate::database::DocumentKeyAttr;
|
||||
use crate::database::update::Update;
|
||||
use crate::database::DATA_INDEX;
|
||||
use crate::database::blob::Blob;
|
||||
use crate::{DocumentId, DocIndex, Attribute, WordArea};
|
||||
|
||||
/// Pending change recorded for one document attribute.
pub enum NewState {
    /// The attribute takes this new, already serialized, value.
    Updated { value: Vec<u8> },
    /// The attribute is deleted.
    Removed,
}
|
||||
|
||||
pub struct PositiveUpdateBuilder<B> {
|
||||
path: PathBuf,
|
||||
schema: Schema,
|
||||
tokenizer_builder: B,
|
||||
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn new<P: Into<PathBuf>>(path: P, schema: Schema, tokenizer_builder: B) -> PositiveUpdateBuilder<B> {
|
||||
PositiveUpdateBuilder {
|
||||
path: path.into(),
|
||||
schema: schema,
|
||||
tokenizer_builder: tokenizer_builder,
|
||||
builder: UnorderedPositiveBlobBuilder::memory(),
|
||||
new_states: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn update<T: Serialize>(&mut self, document: &T) -> Result<DocumentId, SerializerError>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
let document_id = self.schema.document_id(document)?;
|
||||
|
||||
let serializer = Serializer {
|
||||
schema: &self.schema,
|
||||
tokenizer_builder: &self.tokenizer_builder,
|
||||
document_id: document_id,
|
||||
builder: &mut self.builder,
|
||||
new_states: &mut self.new_states
|
||||
};
|
||||
document.serialize(serializer)?;
|
||||
|
||||
Ok(document_id)
|
||||
}
|
||||
|
||||
// TODO value must be a field that can be indexed
|
||||
pub fn update_field(&mut self, id: DocumentId, attr: SchemaAttr, value: String) {
|
||||
let value = bincode::serialize(&value).unwrap();
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Updated { value });
|
||||
}
|
||||
|
||||
pub fn remove_field(&mut self, id: DocumentId, attr: SchemaAttr) {
|
||||
self.new_states.insert(DocumentKeyAttr::new(id, attr), NewState::Removed);
|
||||
}
|
||||
}
|
||||
|
||||
/// Errors raised while a document is serialized into an update.
#[derive(Debug)]
pub enum SerializerError {
    /// No document id could be extracted according to the schema.
    DocumentIdNotFound,
    /// The value is of a kind (`name`) that cannot be serialized here.
    UnserializableType { name: &'static str },
    /// Free-form message, created through `serde::ser::Error::custom`.
    Custom(String),
}
|
||||
|
||||
impl ser::Error for SerializerError {
|
||||
fn custom<T: fmt::Display>(msg: T) -> Self {
|
||||
SerializerError::Custom(msg.to_string())
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Display for SerializerError {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
match self {
|
||||
SerializerError::DocumentIdNotFound => {
|
||||
write!(f, "serialized document does not have an id according to the schema")
|
||||
}
|
||||
SerializerError::UnserializableType { name } => {
|
||||
write!(f, "Only struct and map types are considered valid documents and
|
||||
can be serialized, not {} types directly.", name)
|
||||
},
|
||||
SerializerError::Custom(s) => f.write_str(&s),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// No method is overridden: the default `Error` implementation is used as is.
impl Error for SerializerError {}
|
||||
|
||||
struct Serializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = StructSerializer<'a, B>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "str" })
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "sequence" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
// Ok(MapSerializer {
|
||||
// schema: self.schema,
|
||||
// document_id: self.document_id,
|
||||
// new_states: self.new_states,
|
||||
// })
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Ok(StructSerializer {
|
||||
schema: self.schema,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
builder: self.builder,
|
||||
new_states: self.new_states,
|
||||
})
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
struct StructSerializer<'a, B> {
|
||||
schema: &'a Schema,
|
||||
tokenizer_builder: &'a B,
|
||||
document_id: DocumentId,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
new_states: &'a mut BTreeMap<DocumentKeyAttr, NewState>,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
|
||||
fn serialize_field<T: ?Sized>(
|
||||
&mut self,
|
||||
key: &'static str,
|
||||
value: &T
|
||||
) -> Result<(), Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
if let Some(attr) = self.schema.attribute(key) {
|
||||
let props = self.schema.props(attr);
|
||||
if props.is_stored() {
|
||||
let value = bincode::serialize(value).unwrap();
|
||||
let key = DocumentKeyAttr::new(self.document_id, attr);
|
||||
self.new_states.insert(key, NewState::Updated { value });
|
||||
}
|
||||
if props.is_indexed() {
|
||||
let serializer = IndexerSerializer {
|
||||
builder: self.builder,
|
||||
tokenizer_builder: self.tokenizer_builder,
|
||||
document_id: self.document_id,
|
||||
attribute: attr,
|
||||
};
|
||||
value.serialize(serializer)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
struct IndexerSerializer<'a, B> {
|
||||
tokenizer_builder: &'a B,
|
||||
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||
document_id: DocumentId,
|
||||
attribute: SchemaAttr,
|
||||
}
|
||||
|
||||
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||
where B: TokenizerBuilder
|
||||
{
|
||||
type Ok = ();
|
||||
type Error = SerializerError;
|
||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||
|
||||
forward_to_unserializable_type! {
|
||||
bool => serialize_bool,
|
||||
char => serialize_char,
|
||||
|
||||
i8 => serialize_i8,
|
||||
i16 => serialize_i16,
|
||||
i32 => serialize_i32,
|
||||
i64 => serialize_i64,
|
||||
|
||||
u8 => serialize_u8,
|
||||
u16 => serialize_u16,
|
||||
u32 => serialize_u32,
|
||||
u64 => serialize_u64,
|
||||
|
||||
f32 => serialize_f32,
|
||||
f64 => serialize_f64,
|
||||
}
|
||||
|
||||
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||
for Token { word, word_index, char_index } in self.tokenizer_builder.build(v) {
|
||||
let doc_index = DocIndex {
|
||||
document_id: self.document_id,
|
||||
attribute: Attribute::new_faillible(self.attribute.0, word_index as u32),
|
||||
word_area: WordArea::new_faillible(char_index as u32, word.len() as u16),
|
||||
};
|
||||
|
||||
// insert the exact representation
|
||||
let word_lower = word.to_lowercase();
|
||||
|
||||
// and the unidecoded lowercased version
|
||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||
if word_lower != word_unidecoded {
|
||||
self.builder.insert(word_unidecoded, doc_index);
|
||||
}
|
||||
|
||||
self.builder.insert(word_lower, doc_index);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||
}
|
||||
|
||||
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "Option" })
|
||||
}
|
||||
|
||||
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "()" })
|
||||
}
|
||||
|
||||
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||
}
|
||||
|
||||
fn serialize_unit_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||
}
|
||||
|
||||
fn serialize_newtype_struct<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
value.serialize(self)
|
||||
}
|
||||
|
||||
fn serialize_newtype_variant<T: ?Sized>(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_value: &T
|
||||
) -> Result<Self::Ok, Self::Error>
|
||||
where T: Serialize,
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||
}
|
||||
|
||||
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "seq" })
|
||||
}
|
||||
|
||||
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||
}
|
||||
|
||||
fn serialize_tuple_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||
}
|
||||
|
||||
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||
Err(SerializerError::UnserializableType { name: "map" })
|
||||
}
|
||||
|
||||
fn serialize_struct(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStruct, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct" })
|
||||
}
|
||||
|
||||
fn serialize_struct_variant(
|
||||
self,
|
||||
_name: &'static str,
|
||||
_variant_index: u32,
|
||||
_variant: &'static str,
|
||||
_len: usize
|
||||
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||
{
|
||||
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||
}
|
||||
}
|
||||
|
||||
impl<B> PositiveUpdateBuilder<B> {
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.path.to_string_lossy())?;
|
||||
|
||||
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
|
||||
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
||||
let blob = Blob::Positive(positive_blob);
|
||||
|
||||
// write the data-index aka positive blob
|
||||
let bytes = bincode::serialize(&blob)?;
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents fields updates
|
||||
for (key, state) in self.new_states {
|
||||
match state {
|
||||
NewState::Updated { value } => {
|
||||
file_writer.put(key.as_ref(), &value)?
|
||||
},
|
||||
NewState::Removed => file_writer.delete(key.as_ref())?,
|
||||
}
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
Update::open(self.path)
|
||||
}
|
||||
}
|
93
src/database/update/raw_builder.rs
Normal file
93
src/database/update/raw_builder.rs
Normal file
@ -0,0 +1,93 @@
|
||||
use std::collections::{BTreeMap, BTreeSet};
|
||||
use std::path::PathBuf;
|
||||
use std::error::Error;
|
||||
|
||||
use rocksdb::rocksdb_options;
|
||||
use fst::map::Map;
|
||||
use sdset::Set;
|
||||
|
||||
use crate::database::index::{Index, Positive, PositiveBuilder, Negative};
|
||||
use crate::database::{DATA_INDEX, DocumentKeyAttr};
|
||||
use crate::data::{DocIds, DocIndexes};
|
||||
use crate::{DocumentId, DocIndex};
|
||||
use super::Update;
|
||||
|
||||
type Token = Vec<u8>; // TODO could be replaced by a SmallVec
|
||||
type Value = Vec<u8>;
|
||||
|
||||
pub struct RawUpdateBuilder {
|
||||
sst_file: PathBuf,
|
||||
removed_documents: BTreeSet<DocumentId>,
|
||||
words_indexes: BTreeMap<Token, Vec<DocIndex>>,
|
||||
keys_values: BTreeMap<DocumentKeyAttr, Value>,
|
||||
}
|
||||
|
||||
impl RawUpdateBuilder {
|
||||
pub fn new(path: PathBuf) -> RawUpdateBuilder {
|
||||
RawUpdateBuilder {
|
||||
sst_file: path,
|
||||
removed_documents: BTreeSet::new(),
|
||||
words_indexes: BTreeMap::new(),
|
||||
keys_values: BTreeMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn insert_doc_index(&mut self, token: Vec<u8>, doc_index: DocIndex) {
|
||||
self.words_indexes.entry(token).or_insert_with(Vec::new).push(doc_index)
|
||||
}
|
||||
|
||||
pub fn insert_attribute_value(&mut self, key_attr: DocumentKeyAttr, value: Vec<u8>) -> Option<Vec<u8>> {
|
||||
self.keys_values.insert(key_attr, value)
|
||||
}
|
||||
|
||||
pub fn remove_document(&mut self, id: DocumentId) {
|
||||
self.removed_documents.insert(id);
|
||||
}
|
||||
|
||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||
let tree = {
|
||||
let negative = {
|
||||
let documents_ids: Vec<_> = self.removed_documents.into_iter().collect();
|
||||
let documents_ids = Set::new_unchecked(&documents_ids);
|
||||
let doc_ids = DocIds::new(documents_ids);
|
||||
Negative::new(doc_ids)
|
||||
};
|
||||
|
||||
let positive = {
|
||||
let mut builder = PositiveBuilder::memory();
|
||||
|
||||
for (key, mut indexes) in self.words_indexes {
|
||||
indexes.sort_unstable();
|
||||
let indexes = Set::new_unchecked(&indexes);
|
||||
builder.insert(key, indexes)?;
|
||||
}
|
||||
|
||||
let (map, indexes) = builder.into_inner()?;
|
||||
let map = Map::from_bytes(map)?;
|
||||
let indexes = DocIndexes::from_bytes(indexes)?;
|
||||
Positive::new(map, indexes)
|
||||
};
|
||||
|
||||
Index { negative, positive }
|
||||
};
|
||||
|
||||
let env_options = rocksdb_options::EnvOptions::new();
|
||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||
file_writer.open(&self.sst_file.to_string_lossy())?;
|
||||
|
||||
// write the data-index
|
||||
let mut bytes = Vec::new();
|
||||
tree.write_to_bytes(&mut bytes);
|
||||
file_writer.merge(DATA_INDEX, &bytes)?;
|
||||
|
||||
// write all the documents attributes updates
|
||||
for (key, value) in self.keys_values {
|
||||
file_writer.put(key.as_ref(), &value)?;
|
||||
}
|
||||
|
||||
file_writer.finish()?;
|
||||
|
||||
Ok(Update { sst_file: self.sst_file })
|
||||
}
|
||||
}
|
@ -86,7 +86,7 @@ where D: Deref<Target=DB>,
|
||||
let mut stream = {
|
||||
let mut op_builder = fst::map::OpBuilder::new();
|
||||
for automaton in &automatons {
|
||||
let stream = self.view.blob().as_map().search(automaton);
|
||||
let stream = self.view.index().positive.map().search(automaton);
|
||||
op_builder.push(stream);
|
||||
}
|
||||
op_builder.union()
|
||||
@ -100,7 +100,7 @@ where D: Deref<Target=DB>,
|
||||
let distance = automaton.eval(input).to_u8();
|
||||
let is_exact = distance == 0 && input.len() == automaton.query_len();
|
||||
|
||||
let doc_indexes = self.view.blob().as_indexes();
|
||||
let doc_indexes = &self.view.index().positive.indexes();
|
||||
let doc_indexes = &doc_indexes[iv.value as usize];
|
||||
|
||||
for doc_index in doc_indexes {
|
||||
|
Loading…
Reference in New Issue
Block a user