feat: Index and store/serialize attributes while creating the update

This commit is contained in:
Clément Renault 2018-12-07 11:32:27 +01:00
parent 442834c28f
commit 731ed11153
No known key found for this signature in database
GPG Key ID: 0151CDAB43460DAE
7 changed files with 240 additions and 216 deletions

View File

@ -9,24 +9,9 @@ use crate::database::deserializer::{Deserializer, DeserializerError};
use crate::database::{DATA_INDEX, DATA_SCHEMA};
use crate::blob::positive::PositiveBlob;
use crate::index::schema::Schema;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr};
use crate::DocumentId;
/// Reads the schema stored under the `DATA_SCHEMA` key of the snapshot and decodes it.
///
/// # Errors
///
/// Returns an error when the lookup fails, when the stored bytes cannot be
/// decoded by `Schema::read_from`, or when no schema is stored at all.
/// A missing schema is reported as an error instead of a panic so the caller
/// decides how to react.
fn retrieve_data_schema(snapshot: &Snapshot<&DB>) -> Result<Schema, Box<Error>> {
    match snapshot.get(DATA_SCHEMA)? {
        Some(vector) => Ok(Schema::read_from(&*vector)?),
        None => Err(String::from("BUG: no schema found in the database").into()),
    }
}
/// Loads the positive blob stored under the `DATA_INDEX` key of the snapshot.
///
/// When the key is absent, a default (empty) `PositiveBlob` is returned.
/// Lookup and bincode decoding failures are propagated as errors.
fn retrieve_data_index(snapshot: &Snapshot<&DB>) -> Result<PositiveBlob, Box<Error>> {
    let bytes = match snapshot.get(DATA_INDEX)? {
        Some(bytes) => bytes,
        None => return Ok(PositiveBlob::default()),
    };
    let blob = bincode::deserialize(&*bytes)?;
    Ok(blob)
}
pub struct DatabaseView<'a> {
snapshot: Snapshot<&'a DB>,
schema: Schema,

View File

@ -1,15 +1,17 @@
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::fmt;
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
use rocksdb::rocksdb::Writable;
use rocksdb::rocksdb::{Writable, Snapshot};
pub use crate::database::database_view::DatabaseView;
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
pub use crate::database::database_view::DatabaseView;
use crate::index::update::Update;
use crate::index::schema::Schema;
use crate::blob::positive::PositiveBlob;
use crate::blob::{self, Blob};
mod document_key;
@ -19,6 +21,24 @@ mod deserializer;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_SCHEMA: &[u8] = b"data-schema";
/// Reads the schema stored under the `DATA_SCHEMA` key of the snapshot and decodes it.
///
/// # Errors
///
/// Fails when the lookup fails, when no schema is stored, or when
/// `Schema::read_from` cannot decode the stored bytes.
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
    let stored = snapshot.get(DATA_SCHEMA)?;
    let bytes = stored.ok_or_else(|| String::from("BUG: no schema found in the database"))?;
    let schema = Schema::read_from(&*bytes)?;
    Ok(schema)
}
/// Loads the positive blob stored under the `DATA_INDEX` key of the snapshot.
///
/// An absent key is not an error: a default `PositiveBlob` is returned instead.
///
/// # Errors
///
/// Fails when the lookup fails or when the stored bytes cannot be decoded by bincode.
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
where D: Deref<Target=DB>
{
    let bytes = match snapshot.get(DATA_INDEX)? {
        None => return Ok(PositiveBlob::default()),
        Some(bytes) => bytes,
    };
    let blob = bincode::deserialize(&*bytes)?;
    Ok(blob)
}
/// Newtype wrapper around a RocksDB `DB` handle.
pub struct Database(DB);
impl Database {
@ -162,14 +182,14 @@ mod tests {
struct SimpleDoc {
title: String,
description: String,
timestamp: u64,
}
let title;
let description;
let schema = {
let mut builder = SchemaBuilder::new();
title = builder.new_attribute("title", STORED | INDEXED);
description = builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
@ -181,21 +201,17 @@ mod tests {
let doc0 = SimpleDoc {
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let mut update = {
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
// builder.update_field(0, title, doc0.title.clone());
// builder.update_field(0, description, doc0.description.clone());
// builder.update_field(1, title, doc1.title.clone());
// builder.update_field(1, description, doc1.description.clone());
builder.update(0, &doc0).unwrap();
builder.update(1, &doc1).unwrap();
@ -206,19 +222,9 @@ mod tests {
database.ingest_update_file(update)?;
let view = database.view()?;
println!("{:?}", view);
#[derive(Deserialize, Debug, Clone, PartialEq, Eq)]
struct DeSimpleDoc {
title: char,
}
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
println!("{:?}", de_doc0);
println!("{:?}", de_doc1);
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);

View File

@ -25,6 +25,7 @@ pub struct PositiveUpdateBuilder<B> {
path: PathBuf,
schema: Schema,
tokenizer_builder: B,
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>,
}
@ -34,14 +35,19 @@ impl<B> PositiveUpdateBuilder<B> {
path: path.into(),
schema: schema,
tokenizer_builder: tokenizer_builder,
builder: UnorderedPositiveBlobBuilder::memory(),
new_states: BTreeMap::new(),
}
}
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>> {
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
where B: TokenizerBuilder
{
let serializer = Serializer {
schema: &self.schema,
document_id: id,
tokenizer_builder: &self.tokenizer_builder,
builder: &mut self.builder,
new_states: &mut self.new_states
};
@ -90,9 +96,11 @@ impl fmt::Display for SerializerError {
impl Error for SerializerError {}
struct Serializer<'a> {
struct Serializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
}
@ -106,7 +114,9 @@ macro_rules! forward_to_unserializable_type {
}
}
impl<'a> ser::Serializer for Serializer<'a> {
impl<'a, B> ser::Serializer for Serializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
@ -114,7 +124,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStruct = StructSerializer<'a, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
@ -238,7 +248,9 @@ impl<'a> ser::Serializer for Serializer<'a> {
{
Ok(StructSerializer {
schema: self.schema,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
builder: self.builder,
new_states: self.new_states,
})
}
@ -255,33 +267,17 @@ impl<'a> ser::Serializer for Serializer<'a> {
}
}
/// Records the serialized `value` of the field `name` for `document_id`.
///
/// The value is only kept (as `NewState::Updated`) when the schema marks the
/// attribute as stored; otherwise it is dropped.
///
/// # Errors
///
/// Returns `SerializerError::SchemaDontMatch` when `name` is not an attribute
/// of the schema.
fn serialize_field(
    schema: &Schema,
    document_id: DocumentId,
    new_states: &mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
    name: &str,
    value: Vec<u8>,
) -> Result<(), SerializerError>
{
    // Guard clause: unknown attributes are rejected up front.
    let attr = match schema.attribute(name) {
        Some(attr) => attr,
        None => return Err(SerializerError::SchemaDontMatch { attribute: name.to_owned() }),
    };

    if schema.props(attr).is_stored() {
        let key = (document_id, attr);
        new_states.insert(key, NewState::Updated { value });
    }

    Ok(())
}
struct StructSerializer<'a> {
struct StructSerializer<'a, B> {
schema: &'a Schema,
tokenizer_builder: &'a B,
document_id: DocumentId,
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
@ -292,11 +288,26 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
) -> Result<(), Self::Error>
where T: Serialize,
{
let value = match bincode::serialize(value) {
Ok(value) => value,
Err(e) => return Err(SerializerError::UnserializableType { name: "???" }),
};
serialize_field(self.schema, self.document_id, self.new_states, key, value)
match self.schema.attribute(key) {
Some(attr) => {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.new_states.insert((self.document_id, attr), NewState::Updated { value });
}
if props.is_indexed() {
let serializer = IndexerSerializer {
builder: self.builder,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
};
value.serialize(serializer)?;
}
Ok(())
},
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
}
}
fn end(self) -> Result<Self::Ok, Self::Error> {
@ -304,52 +315,181 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
}
}
impl<B> PositiveUpdateBuilder<B>
/// Serializer used to index a single attribute value of a document:
/// its `serialize_str` tokenizes the text and inserts every word into `builder`.
struct IndexerSerializer<'a, B> {
// Builds the tokenizer run over the text being indexed.
tokenizer_builder: &'a B,
// Blob builder that receives the (word, DocIndex) pairs.
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
// Document the indexed value belongs to; copied into every DocIndex.
document_id: DocumentId,
// Schema attribute being indexed; copied into every DocIndex.
attribute: SchemaAttr,
}
/// Indexing-only serializer.
///
/// String values are tokenized and every word is inserted into the blob
/// builder. Newtype structs are unwrapped and their inner value is serialized
/// with the same rules. Every other kind of value is rejected with
/// `SerializerError::UnserializableType`.
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
where B: TokenizerBuilder
{
    type Ok = ();
    type Error = SerializerError;
    type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;

    forward_to_unserializable_type! {
        bool => serialize_bool,
        char => serialize_char,

        i8  => serialize_i8,
        i16 => serialize_i16,
        i32 => serialize_i32,
        i64 => serialize_i64,

        u8  => serialize_u8,
        u16 => serialize_u16,
        u32 => serialize_u32,
        u64 => serialize_u64,

        f32 => serialize_f32,
        f64 => serialize_f64,
    }

    /// Tokenizes `text` and inserts each word into the builder, once lowercased
    /// and, when it differs, once more in its unidecoded lowercased form.
    fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
        for (position, token) in self.tokenizer_builder.build(text) {
            let doc_index = DocIndex {
                document_id: self.document_id,
                attribute: self.attribute.as_u32() as u8,
                attribute_index: position as u32,
            };

            // exact (lowercased) representation of the word
            let exact = token.to_lowercase();
            // ASCII transliteration of the word, lowercased as well
            let folded = unidecode::unidecode(token).to_lowercase();

            // only insert the transliterated form when it adds a new key
            if folded != exact {
                self.builder.insert(folded, doc_index);
            }
            self.builder.insert(exact, doc_index);
        }

        Ok(())
    }

    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "&[u8]" })
    }

    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "()" })
    }

    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
        Err(SerializerError::UnserializableType { name: "unit struct" })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str
    ) -> Result<Self::Ok, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "unit variant" })
    }

    /// Newtype wrappers are transparent: the wrapped value is serialized as is.
    fn serialize_newtype_struct<T: ?Sized>(
        self,
        _name: &'static str,
        value: &T
    ) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        value.serialize(self)
    }

    fn serialize_newtype_variant<T: ?Sized>(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T
    ) -> Result<Self::Ok, Self::Error>
    where T: Serialize,
    {
        Err(SerializerError::UnserializableType { name: "newtype variant" })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
        Err(SerializerError::UnserializableType { name: "seq" })
    }

    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
        Err(SerializerError::UnserializableType { name: "tuple" })
    }

    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize
    ) -> Result<Self::SerializeTupleStruct, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "tuple struct" })
    }

    fn serialize_tuple_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize
    ) -> Result<Self::SerializeTupleVariant, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "tuple variant" })
    }

    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
        Err(SerializerError::UnserializableType { name: "map" })
    }

    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize
    ) -> Result<Self::SerializeStruct, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "struct" })
    }

    fn serialize_struct_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize
    ) -> Result<Self::SerializeStructVariant, Self::Error>
    {
        Err(SerializerError::UnserializableType { name: "struct variant" })
    }
}
impl<B> PositiveUpdateBuilder<B> {
pub fn build(self) -> Result<Update, Box<Error>> {
let env_options = rocksdb_options::EnvOptions::new();
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
file_writer.open(&self.path.to_string_lossy())?;
let mut builder = UnorderedPositiveBlobBuilder::memory();
for ((document_id, attr), state) in &self.new_states {
let props = self.schema.props(*attr);
let value = match state {
NewState::Updated { value } if props.is_indexed() => value,
_ => continue,
};
let value: String = match bincode::deserialize(&value) {
Ok(value) => value,
Err(e) => {
eprintln!("{}", e);
continue
},
};
for (index, word) in self.tokenizer_builder.build(&value) {
let doc_index = DocIndex {
document_id: *document_id,
attribute: attr.as_u32() as u8,
attribute_index: index as u32,
};
// insert the exact representation
let word_lower = word.to_lowercase();
// and the unidecoded lowercased version
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
if word_lower != word_unidecoded {
builder.insert(word_unidecoded, doc_index);
}
builder.insert(word_lower, doc_index);
}
}
let (blob_fst_map, blob_doc_idx) = builder.into_inner()?;
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
let blob = Blob::Positive(positive_blob);

View File

@ -2,7 +2,6 @@ pub mod automaton;
pub mod blob;
pub mod database;
pub mod data;
pub mod retrieve;
pub mod index;
pub mod rank;
pub mod tokenizer;

View File

@ -11,9 +11,9 @@ use fst::Streamer;
use crate::automaton::{self, DfaExt, AutomatonExt};
use crate::rank::criterion::{self, Criterion};
use crate::rank::distinct_map::DistinctMap;
use crate::database::retrieve_data_index;
use crate::blob::PositiveBlob;
use crate::{Match, DocumentId};
use crate::retrieve::Retrieve;
use crate::rank::Document;
fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> {
@ -48,7 +48,7 @@ impl<T, C> QueryBuilder<T, C>
where T: Deref<Target=DB>,
{
pub fn with_criteria(snapshot: Snapshot<T>, criteria: Vec<C>) -> Result<Self, Box<Error>> {
let blob = snapshot.data_index()?;
let blob = retrieve_data_index(&snapshot)?;
Ok(QueryBuilder { snapshot, blob, criteria })
}

View File

@ -1,60 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use ::rocksdb::rocksdb::{DB, Snapshot, DBVector};
use crate::index::schema::{Schema, SchemaAttr};
use crate::blob::PositiveBlob;
use crate::DocumentId;
/// Document accessor pairing a borrowed retriever with the schema read from it.
/// Built by `Retrieve::doc_database`.
pub struct DocDatabase<'a, R: ?Sized> {
// Borrowed source the `DocDatabase` was created from.
retrieve: &'a R,
// Schema loaded when the `DocDatabase` was created.
schema: Schema,
}
// Both accessors are stubs: their bodies are `unimplemented!()`, so calling
// either of them panics.
impl<'a, R> DocDatabase<'a, R> {
/// Not implemented yet; panics when called.
pub fn get_document<D>(&self, id: DocumentId) -> Result<Option<D>, Box<Error>> {
// if ids.is_empty() { return Ok(Vec::new()) }
unimplemented!()
}
/// Not implemented yet; panics when called.
pub fn get_document_attribute(&self, id: DocumentId, attr: SchemaAttr) -> Result<DBVector, Box<Error>> {
unimplemented!()
}
}
/// Read access to the data kept alongside the documents: the schema, the
/// data index and a `DocDatabase` built on top of the implementor.
pub trait Retrieve {
/// Returns the stored schema, or `None` when there is none.
fn schema(&self) -> Result<Option<Schema>, Box<Error>>;
/// Returns the stored data index as a `PositiveBlob`.
fn data_index(&self) -> Result<PositiveBlob, Box<Error>>;
/// Returns a `DocDatabase` borrowing `self`.
fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>>;
}
/// `Retrieve` implementation reading the `data-schema` and `data-index` keys.
impl<T> Retrieve for Snapshot<T>
where T: Deref<Target=DB>,
{
    /// Decodes the schema stored under `data-schema`; `None` when the key is absent.
    fn schema(&self) -> Result<Option<Schema>, Box<Error>> {
        let bytes = match self.deref().get(b"data-schema")? {
            Some(bytes) => bytes,
            None => return Ok(None),
        };
        let schema = Schema::read_from(&*bytes)?;
        Ok(Some(schema))
    }

    /// Decodes the blob stored under `data-index`; a default blob when the key is absent.
    fn data_index(&self) -> Result<PositiveBlob, Box<Error>> {
        let bytes = match self.deref().get(b"data-index")? {
            Some(bytes) => bytes,
            None => return Ok(PositiveBlob::default()),
        };
        let blob = bincode::deserialize(&bytes)?;
        Ok(blob)
    }

    /// Builds a `DocDatabase` over `self`; fails when no schema is stored.
    fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>> {
        let schema = self
            .schema()?
            .ok_or_else(|| String::from("BUG: could not find schema"))?;

        Ok(DocDatabase { retrieve: self, schema })
    }
}

View File

@ -1,52 +1,6 @@
use std::mem;
use self::Separator::*;
/// Tokenizer wrapping its input strings, either a single string
/// (`Option<String>`) or several (`Vec<String>`).
struct MegaTokenizer<I> {
    strings: I,
}

/// Wraps a single owned string.
impl From<String> for MegaTokenizer<Option<String>> {
    fn from(string: String) -> Self {
        let strings = Some(string);
        Self { strings }
    }
}

/// Wraps a list of owned strings, kept as given.
impl From<Vec<String>> for MegaTokenizer<Vec<String>> {
    fn from(strings: Vec<String>) -> Self {
        Self { strings }
    }
}

/// Yields `(usize, String)` pairs. The iteration itself is not implemented
/// yet: `next` panics when called.
impl<I> Iterator for MegaTokenizer<I> {
    type Item = (usize, String);

    fn next(&mut self) -> Option<Self::Item> {
        unimplemented!()
    }
}
// Describes the output expected from `MegaTokenizer`: (number, word) pairs.
// NOTE(review): `MegaTokenizer::next` is `unimplemented!()`, so this test
// panics on its first `next()` call as written.
#[test]
fn xxx() {
// single string input: words numbered 0 and 1, punctuation dropped
let s1 = "hello world!";
let mut s1 = MegaTokenizer::from(s1.to_owned());
assert_eq!(s1.next(), Some((0, "hello".into())));
assert_eq!(s1.next(), Some((1, "world".into())));
assert_eq!(s1.next(), None);
// several strings: the expected number jumps from 1 to 8 between the two
// strings — presumably a deliberate gap between entries; TODO confirm
let v1 = vec!["Vin Diesel".to_owned(), "Quentin Tarantino".to_owned()];
let mut v1 = MegaTokenizer::from(v1);
assert_eq!(v1.next(), Some((0, "Vin".into())));
assert_eq!(v1.next(), Some((1, "Diesel".into())));
assert_eq!(v1.next(), Some((8, "Quentin".into())));
assert_eq!(v1.next(), Some((9, "Tarantino".into())));
assert_eq!(v1.next(), None);
}
/// Factory of tokenizers over a borrowed text.
pub trait TokenizerBuilder {
/// Returns a boxed iterator of `(usize, &str)` pairs whose string slices
/// borrow from `text`.
// NOTE(review): the `usize` looks like the position of the token in the
// text — confirm against the implementors.
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
}