mirror of
https://github.com/meilisearch/MeiliSearch
synced 2024-11-26 14:54:27 +01:00
feat: Index and store/serialize attributes while creating the update
This commit is contained in:
parent
442834c28f
commit
731ed11153
@ -9,24 +9,9 @@ use crate::database::deserializer::{Deserializer, DeserializerError};
|
|||||||
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
use crate::database::{DATA_INDEX, DATA_SCHEMA};
|
||||||
use crate::blob::positive::PositiveBlob;
|
use crate::blob::positive::PositiveBlob;
|
||||||
use crate::index::schema::Schema;
|
use crate::index::schema::Schema;
|
||||||
use crate::database::{DocumentKey, DocumentKeyAttr};
|
use crate::database::{retrieve_data_schema, DocumentKey, DocumentKeyAttr};
|
||||||
use crate::DocumentId;
|
use crate::DocumentId;
|
||||||
|
|
||||||
// FIXME Do not panic!
|
|
||||||
fn retrieve_data_schema(snapshot: &Snapshot<&DB>) -> Result<Schema, Box<Error>> {
|
|
||||||
match snapshot.get(DATA_SCHEMA)? {
|
|
||||||
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
|
||||||
None => panic!("BUG: no schema found in the database"),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn retrieve_data_index(snapshot: &Snapshot<&DB>) -> Result<PositiveBlob, Box<Error>> {
|
|
||||||
match snapshot.get(DATA_INDEX)? {
|
|
||||||
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
|
||||||
None => Ok(PositiveBlob::default()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub struct DatabaseView<'a> {
|
pub struct DatabaseView<'a> {
|
||||||
snapshot: Snapshot<&'a DB>,
|
snapshot: Snapshot<&'a DB>,
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
|
@ -1,15 +1,17 @@
|
|||||||
use std::error::Error;
|
use std::error::Error;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use std::ops::Deref;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
use rocksdb::rocksdb_options::{DBOptions, IngestExternalFileOptions, ColumnFamilyOptions};
|
||||||
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
|
use rocksdb::{DB, DBVector, MergeOperands, SeekKey};
|
||||||
use rocksdb::rocksdb::Writable;
|
use rocksdb::rocksdb::{Writable, Snapshot};
|
||||||
|
|
||||||
pub use crate::database::database_view::DatabaseView;
|
|
||||||
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
pub use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
|
||||||
|
pub use crate::database::database_view::DatabaseView;
|
||||||
use crate::index::update::Update;
|
use crate::index::update::Update;
|
||||||
use crate::index::schema::Schema;
|
use crate::index::schema::Schema;
|
||||||
|
use crate::blob::positive::PositiveBlob;
|
||||||
use crate::blob::{self, Blob};
|
use crate::blob::{self, Blob};
|
||||||
|
|
||||||
mod document_key;
|
mod document_key;
|
||||||
@ -19,6 +21,24 @@ mod deserializer;
|
|||||||
const DATA_INDEX: &[u8] = b"data-index";
|
const DATA_INDEX: &[u8] = b"data-index";
|
||||||
const DATA_SCHEMA: &[u8] = b"data-schema";
|
const DATA_SCHEMA: &[u8] = b"data-schema";
|
||||||
|
|
||||||
|
pub fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
|
||||||
|
where D: Deref<Target=DB>
|
||||||
|
{
|
||||||
|
match snapshot.get(DATA_SCHEMA)? {
|
||||||
|
Some(vector) => Ok(Schema::read_from(&*vector)?),
|
||||||
|
None => Err(String::from("BUG: no schema found in the database").into()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<PositiveBlob, Box<Error>>
|
||||||
|
where D: Deref<Target=DB>
|
||||||
|
{
|
||||||
|
match snapshot.get(DATA_INDEX)? {
|
||||||
|
Some(vector) => Ok(bincode::deserialize(&*vector)?),
|
||||||
|
None => Ok(PositiveBlob::default()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Database(DB);
|
pub struct Database(DB);
|
||||||
|
|
||||||
impl Database {
|
impl Database {
|
||||||
@ -162,14 +182,14 @@ mod tests {
|
|||||||
struct SimpleDoc {
|
struct SimpleDoc {
|
||||||
title: String,
|
title: String,
|
||||||
description: String,
|
description: String,
|
||||||
|
timestamp: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
let title;
|
|
||||||
let description;
|
|
||||||
let schema = {
|
let schema = {
|
||||||
let mut builder = SchemaBuilder::new();
|
let mut builder = SchemaBuilder::new();
|
||||||
title = builder.new_attribute("title", STORED | INDEXED);
|
builder.new_attribute("title", STORED | INDEXED);
|
||||||
description = builder.new_attribute("description", STORED | INDEXED);
|
builder.new_attribute("description", STORED | INDEXED);
|
||||||
|
builder.new_attribute("timestamp", STORED);
|
||||||
builder.build()
|
builder.build()
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -181,21 +201,17 @@ mod tests {
|
|||||||
let doc0 = SimpleDoc {
|
let doc0 = SimpleDoc {
|
||||||
title: String::from("I am a title"),
|
title: String::from("I am a title"),
|
||||||
description: String::from("I am a description"),
|
description: String::from("I am a description"),
|
||||||
|
timestamp: 1234567,
|
||||||
};
|
};
|
||||||
let doc1 = SimpleDoc {
|
let doc1 = SimpleDoc {
|
||||||
title: String::from("I am the second title"),
|
title: String::from("I am the second title"),
|
||||||
description: String::from("I am the second description"),
|
description: String::from("I am the second description"),
|
||||||
|
timestamp: 7654321,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut update = {
|
let mut update = {
|
||||||
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
let mut builder = PositiveUpdateBuilder::new(update_path, schema, tokenizer_builder);
|
||||||
|
|
||||||
// builder.update_field(0, title, doc0.title.clone());
|
|
||||||
// builder.update_field(0, description, doc0.description.clone());
|
|
||||||
|
|
||||||
// builder.update_field(1, title, doc1.title.clone());
|
|
||||||
// builder.update_field(1, description, doc1.description.clone());
|
|
||||||
|
|
||||||
builder.update(0, &doc0).unwrap();
|
builder.update(0, &doc0).unwrap();
|
||||||
builder.update(1, &doc1).unwrap();
|
builder.update(1, &doc1).unwrap();
|
||||||
|
|
||||||
@ -206,19 +222,9 @@ mod tests {
|
|||||||
database.ingest_update_file(update)?;
|
database.ingest_update_file(update)?;
|
||||||
let view = database.view()?;
|
let view = database.view()?;
|
||||||
|
|
||||||
println!("{:?}", view);
|
|
||||||
|
|
||||||
#[derive(Deserialize, Debug, Clone, PartialEq, Eq)]
|
|
||||||
struct DeSimpleDoc {
|
|
||||||
title: char,
|
|
||||||
}
|
|
||||||
|
|
||||||
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
|
let de_doc0: SimpleDoc = view.retrieve_document(0)?;
|
||||||
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
|
let de_doc1: SimpleDoc = view.retrieve_document(1)?;
|
||||||
|
|
||||||
println!("{:?}", de_doc0);
|
|
||||||
println!("{:?}", de_doc1);
|
|
||||||
|
|
||||||
assert_eq!(doc0, de_doc0);
|
assert_eq!(doc0, de_doc0);
|
||||||
assert_eq!(doc1, de_doc1);
|
assert_eq!(doc1, de_doc1);
|
||||||
|
|
||||||
|
@ -25,6 +25,7 @@ pub struct PositiveUpdateBuilder<B> {
|
|||||||
path: PathBuf,
|
path: PathBuf,
|
||||||
schema: Schema,
|
schema: Schema,
|
||||||
tokenizer_builder: B,
|
tokenizer_builder: B,
|
||||||
|
builder: UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||||
new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
new_states: BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -34,14 +35,19 @@ impl<B> PositiveUpdateBuilder<B> {
|
|||||||
path: path.into(),
|
path: path.into(),
|
||||||
schema: schema,
|
schema: schema,
|
||||||
tokenizer_builder: tokenizer_builder,
|
tokenizer_builder: tokenizer_builder,
|
||||||
|
builder: UnorderedPositiveBlobBuilder::memory(),
|
||||||
new_states: BTreeMap::new(),
|
new_states: BTreeMap::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>> {
|
pub fn update<T: Serialize>(&mut self, id: DocumentId, document: &T) -> Result<(), Box<Error>>
|
||||||
|
where B: TokenizerBuilder
|
||||||
|
{
|
||||||
let serializer = Serializer {
|
let serializer = Serializer {
|
||||||
schema: &self.schema,
|
schema: &self.schema,
|
||||||
document_id: id,
|
document_id: id,
|
||||||
|
tokenizer_builder: &self.tokenizer_builder,
|
||||||
|
builder: &mut self.builder,
|
||||||
new_states: &mut self.new_states
|
new_states: &mut self.new_states
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -90,9 +96,11 @@ impl fmt::Display for SerializerError {
|
|||||||
|
|
||||||
impl Error for SerializerError {}
|
impl Error for SerializerError {}
|
||||||
|
|
||||||
struct Serializer<'a> {
|
struct Serializer<'a, B> {
|
||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
|
tokenizer_builder: &'a B,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
|
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||||
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -106,7 +114,9 @@ macro_rules! forward_to_unserializable_type {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::Serializer for Serializer<'a> {
|
impl<'a, B> ser::Serializer for Serializer<'a, B>
|
||||||
|
where B: TokenizerBuilder
|
||||||
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
@ -114,7 +124,7 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
type SerializeStruct = StructSerializer<'a>;
|
type SerializeStruct = StructSerializer<'a, B>;
|
||||||
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
|
||||||
forward_to_unserializable_type! {
|
forward_to_unserializable_type! {
|
||||||
@ -238,7 +248,9 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
{
|
{
|
||||||
Ok(StructSerializer {
|
Ok(StructSerializer {
|
||||||
schema: self.schema,
|
schema: self.schema,
|
||||||
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
document_id: self.document_id,
|
document_id: self.document_id,
|
||||||
|
builder: self.builder,
|
||||||
new_states: self.new_states,
|
new_states: self.new_states,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@ -255,33 +267,17 @@ impl<'a> ser::Serializer for Serializer<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn serialize_field(
|
struct StructSerializer<'a, B> {
|
||||||
schema: &Schema,
|
|
||||||
document_id: DocumentId,
|
|
||||||
new_states: &mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
|
||||||
name: &str,
|
|
||||||
value: Vec<u8>,
|
|
||||||
) -> Result<(), SerializerError>
|
|
||||||
{
|
|
||||||
match schema.attribute(name) {
|
|
||||||
Some(attr) => {
|
|
||||||
let props = schema.props(attr);
|
|
||||||
if props.is_stored() {
|
|
||||||
new_states.insert((document_id, attr), NewState::Updated { value });
|
|
||||||
}
|
|
||||||
Ok(())
|
|
||||||
},
|
|
||||||
None => Err(SerializerError::SchemaDontMatch { attribute: name.to_owned() }),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
struct StructSerializer<'a> {
|
|
||||||
schema: &'a Schema,
|
schema: &'a Schema,
|
||||||
|
tokenizer_builder: &'a B,
|
||||||
document_id: DocumentId,
|
document_id: DocumentId,
|
||||||
|
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||||
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
new_states: &'a mut BTreeMap<(DocumentId, SchemaAttr), NewState>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
impl<'a, B> ser::SerializeStruct for StructSerializer<'a, B>
|
||||||
|
where B: TokenizerBuilder
|
||||||
|
{
|
||||||
type Ok = ();
|
type Ok = ();
|
||||||
type Error = SerializerError;
|
type Error = SerializerError;
|
||||||
|
|
||||||
@ -292,11 +288,26 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
) -> Result<(), Self::Error>
|
) -> Result<(), Self::Error>
|
||||||
where T: Serialize,
|
where T: Serialize,
|
||||||
{
|
{
|
||||||
let value = match bincode::serialize(value) {
|
match self.schema.attribute(key) {
|
||||||
Ok(value) => value,
|
Some(attr) => {
|
||||||
Err(e) => return Err(SerializerError::UnserializableType { name: "???" }),
|
let props = self.schema.props(attr);
|
||||||
|
if props.is_stored() {
|
||||||
|
let value = bincode::serialize(value).unwrap();
|
||||||
|
self.new_states.insert((self.document_id, attr), NewState::Updated { value });
|
||||||
|
}
|
||||||
|
if props.is_indexed() {
|
||||||
|
let serializer = IndexerSerializer {
|
||||||
|
builder: self.builder,
|
||||||
|
tokenizer_builder: self.tokenizer_builder,
|
||||||
|
document_id: self.document_id,
|
||||||
|
attribute: attr,
|
||||||
};
|
};
|
||||||
serialize_field(self.schema, self.document_id, self.new_states, key, value)
|
value.serialize(serializer)?;
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
None => Err(SerializerError::SchemaDontMatch { attribute: key.to_owned() }),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn end(self) -> Result<Self::Ok, Self::Error> {
|
fn end(self) -> Result<Self::Ok, Self::Error> {
|
||||||
@ -304,35 +315,49 @@ impl<'a> ser::SerializeStruct for StructSerializer<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<B> PositiveUpdateBuilder<B>
|
struct IndexerSerializer<'a, B> {
|
||||||
|
tokenizer_builder: &'a B,
|
||||||
|
builder: &'a mut UnorderedPositiveBlobBuilder<Vec<u8>, Vec<u8>>,
|
||||||
|
document_id: DocumentId,
|
||||||
|
attribute: SchemaAttr,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a, B> ser::Serializer for IndexerSerializer<'a, B>
|
||||||
where B: TokenizerBuilder
|
where B: TokenizerBuilder
|
||||||
{
|
{
|
||||||
pub fn build(self) -> Result<Update, Box<Error>> {
|
type Ok = ();
|
||||||
let env_options = rocksdb_options::EnvOptions::new();
|
type Error = SerializerError;
|
||||||
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
file_writer.open(&self.path.to_string_lossy())?;
|
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
|
||||||
|
|
||||||
let mut builder = UnorderedPositiveBlobBuilder::memory();
|
forward_to_unserializable_type! {
|
||||||
for ((document_id, attr), state) in &self.new_states {
|
bool => serialize_bool,
|
||||||
let props = self.schema.props(*attr);
|
char => serialize_char,
|
||||||
let value = match state {
|
|
||||||
NewState::Updated { value } if props.is_indexed() => value,
|
|
||||||
_ => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
let value: String = match bincode::deserialize(&value) {
|
i8 => serialize_i8,
|
||||||
Ok(value) => value,
|
i16 => serialize_i16,
|
||||||
Err(e) => {
|
i32 => serialize_i32,
|
||||||
eprintln!("{}", e);
|
i64 => serialize_i64,
|
||||||
continue
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
for (index, word) in self.tokenizer_builder.build(&value) {
|
u8 => serialize_u8,
|
||||||
|
u16 => serialize_u16,
|
||||||
|
u32 => serialize_u32,
|
||||||
|
u64 => serialize_u64,
|
||||||
|
|
||||||
|
f32 => serialize_f32,
|
||||||
|
f64 => serialize_f64,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
|
||||||
|
for (index, word) in self.tokenizer_builder.build(v) {
|
||||||
let doc_index = DocIndex {
|
let doc_index = DocIndex {
|
||||||
document_id: *document_id,
|
document_id: self.document_id,
|
||||||
attribute: attr.as_u32() as u8,
|
attribute: self.attribute.as_u32() as u8,
|
||||||
attribute_index: index as u32,
|
attribute_index: index as u32,
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -342,14 +367,129 @@ where B: TokenizerBuilder
|
|||||||
// and the unidecoded lowercased version
|
// and the unidecoded lowercased version
|
||||||
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
|
||||||
if word_lower != word_unidecoded {
|
if word_lower != word_unidecoded {
|
||||||
builder.insert(word_unidecoded, doc_index);
|
self.builder.insert(word_unidecoded, doc_index);
|
||||||
}
|
}
|
||||||
|
|
||||||
builder.insert(word_lower, doc_index);
|
self.builder.insert(word_lower, doc_index);
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_bytes(self, v: &[u8]) -> Result<Self::Ok, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "&[u8]" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "Option" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
|
||||||
|
where T: Serialize,
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "Option" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "()" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "unit struct" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_unit_variant(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_variant_index: u32,
|
||||||
|
_variant: &'static str
|
||||||
|
) -> Result<Self::Ok, Self::Error>
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "unit variant" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_newtype_struct<T: ?Sized>(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
value: &T
|
||||||
|
) -> Result<Self::Ok, Self::Error>
|
||||||
|
where T: Serialize,
|
||||||
|
{
|
||||||
|
value.serialize(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_newtype_variant<T: ?Sized>(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_variant_index: u32,
|
||||||
|
_variant: &'static str,
|
||||||
|
_value: &T
|
||||||
|
) -> Result<Self::Ok, Self::Error>
|
||||||
|
where T: Serialize,
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "newtype variant" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "seq" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "tuple" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_tuple_struct(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_len: usize
|
||||||
|
) -> Result<Self::SerializeTupleStruct, Self::Error>
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "tuple struct" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_tuple_variant(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_variant_index: u32,
|
||||||
|
_variant: &'static str,
|
||||||
|
_len: usize
|
||||||
|
) -> Result<Self::SerializeTupleVariant, Self::Error>
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "tuple variant" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
|
||||||
|
Err(SerializerError::UnserializableType { name: "map" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_struct(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_len: usize
|
||||||
|
) -> Result<Self::SerializeStruct, Self::Error>
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "struct" })
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize_struct_variant(
|
||||||
|
self,
|
||||||
|
_name: &'static str,
|
||||||
|
_variant_index: u32,
|
||||||
|
_variant: &'static str,
|
||||||
|
_len: usize
|
||||||
|
) -> Result<Self::SerializeStructVariant, Self::Error>
|
||||||
|
{
|
||||||
|
Err(SerializerError::UnserializableType { name: "struct variant" })
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let (blob_fst_map, blob_doc_idx) = builder.into_inner()?;
|
impl<B> PositiveUpdateBuilder<B> {
|
||||||
|
pub fn build(self) -> Result<Update, Box<Error>> {
|
||||||
|
let env_options = rocksdb_options::EnvOptions::new();
|
||||||
|
let column_family_options = rocksdb_options::ColumnFamilyOptions::new();
|
||||||
|
let mut file_writer = rocksdb::SstFileWriter::new(env_options, column_family_options);
|
||||||
|
file_writer.open(&self.path.to_string_lossy())?;
|
||||||
|
|
||||||
|
let (blob_fst_map, blob_doc_idx) = self.builder.into_inner()?;
|
||||||
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
let positive_blob = PositiveBlob::from_bytes(blob_fst_map, blob_doc_idx)?;
|
||||||
let blob = Blob::Positive(positive_blob);
|
let blob = Blob::Positive(positive_blob);
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@ pub mod automaton;
|
|||||||
pub mod blob;
|
pub mod blob;
|
||||||
pub mod database;
|
pub mod database;
|
||||||
pub mod data;
|
pub mod data;
|
||||||
pub mod retrieve;
|
|
||||||
pub mod index;
|
pub mod index;
|
||||||
pub mod rank;
|
pub mod rank;
|
||||||
pub mod tokenizer;
|
pub mod tokenizer;
|
||||||
|
@ -11,9 +11,9 @@ use fst::Streamer;
|
|||||||
use crate::automaton::{self, DfaExt, AutomatonExt};
|
use crate::automaton::{self, DfaExt, AutomatonExt};
|
||||||
use crate::rank::criterion::{self, Criterion};
|
use crate::rank::criterion::{self, Criterion};
|
||||||
use crate::rank::distinct_map::DistinctMap;
|
use crate::rank::distinct_map::DistinctMap;
|
||||||
|
use crate::database::retrieve_data_index;
|
||||||
use crate::blob::PositiveBlob;
|
use crate::blob::PositiveBlob;
|
||||||
use crate::{Match, DocumentId};
|
use crate::{Match, DocumentId};
|
||||||
use crate::retrieve::Retrieve;
|
|
||||||
use crate::rank::Document;
|
use crate::rank::Document;
|
||||||
|
|
||||||
fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> {
|
fn clamp_range<T: Copy + Ord>(range: Range<T>, big: Range<T>) -> Range<T> {
|
||||||
@ -48,7 +48,7 @@ impl<T, C> QueryBuilder<T, C>
|
|||||||
where T: Deref<Target=DB>,
|
where T: Deref<Target=DB>,
|
||||||
{
|
{
|
||||||
pub fn with_criteria(snapshot: Snapshot<T>, criteria: Vec<C>) -> Result<Self, Box<Error>> {
|
pub fn with_criteria(snapshot: Snapshot<T>, criteria: Vec<C>) -> Result<Self, Box<Error>> {
|
||||||
let blob = snapshot.data_index()?;
|
let blob = retrieve_data_index(&snapshot)?;
|
||||||
Ok(QueryBuilder { snapshot, blob, criteria })
|
Ok(QueryBuilder { snapshot, blob, criteria })
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1,60 +0,0 @@
|
|||||||
use std::error::Error;
|
|
||||||
use std::ops::Deref;
|
|
||||||
|
|
||||||
use ::rocksdb::rocksdb::{DB, Snapshot, DBVector};
|
|
||||||
|
|
||||||
use crate::index::schema::{Schema, SchemaAttr};
|
|
||||||
use crate::blob::PositiveBlob;
|
|
||||||
use crate::DocumentId;
|
|
||||||
|
|
||||||
pub struct DocDatabase<'a, R: ?Sized> {
|
|
||||||
retrieve: &'a R,
|
|
||||||
schema: Schema,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'a, R> DocDatabase<'a, R> {
|
|
||||||
pub fn get_document<D>(&self, id: DocumentId) -> Result<Option<D>, Box<Error>> {
|
|
||||||
// if ids.is_empty() { return Ok(Vec::new()) }
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn get_document_attribute(&self, id: DocumentId, attr: SchemaAttr) -> Result<DBVector, Box<Error>> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
pub trait Retrieve {
|
|
||||||
fn schema(&self) -> Result<Option<Schema>, Box<Error>>;
|
|
||||||
fn data_index(&self) -> Result<PositiveBlob, Box<Error>>;
|
|
||||||
fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>>;
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<T> Retrieve for Snapshot<T>
|
|
||||||
where T: Deref<Target=DB>,
|
|
||||||
{
|
|
||||||
fn schema(&self) -> Result<Option<Schema>, Box<Error>> {
|
|
||||||
match self.deref().get(b"data-schema")? {
|
|
||||||
Some(value) => Ok(Some(Schema::read_from(&*value)?)),
|
|
||||||
None => Ok(None),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn data_index(&self) -> Result<PositiveBlob, Box<Error>> {
|
|
||||||
match self.deref().get(b"data-index")? {
|
|
||||||
Some(value) => Ok(bincode::deserialize(&value)?),
|
|
||||||
None => Ok(PositiveBlob::default()),
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn doc_database(&self) -> Result<DocDatabase<Self>, Box<Error>> {
|
|
||||||
let schema = match self.schema()? {
|
|
||||||
Some(schema) => schema,
|
|
||||||
None => return Err(String::from("BUG: could not find schema").into()),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(DocDatabase {
|
|
||||||
retrieve: self,
|
|
||||||
schema: schema,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
@ -1,52 +1,6 @@
|
|||||||
use std::mem;
|
use std::mem;
|
||||||
use self::Separator::*;
|
use self::Separator::*;
|
||||||
|
|
||||||
struct MegaTokenizer<I> {
|
|
||||||
strings: I,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<String> for MegaTokenizer<Option<String>> {
|
|
||||||
fn from(string: String) -> Self {
|
|
||||||
MegaTokenizer { strings: Some(string) }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl From<Vec<String>> for MegaTokenizer<Vec<String>> {
|
|
||||||
fn from(strings: Vec<String>) -> Self {
|
|
||||||
MegaTokenizer { strings }
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<I> Iterator for MegaTokenizer<I> {
|
|
||||||
type Item = (usize, String);
|
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
|
||||||
unimplemented!()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn xxx() {
|
|
||||||
let s1 = "hello world!";
|
|
||||||
let mut s1 = MegaTokenizer::from(s1.to_owned());
|
|
||||||
|
|
||||||
assert_eq!(s1.next(), Some((0, "hello".into())));
|
|
||||||
assert_eq!(s1.next(), Some((1, "world".into())));
|
|
||||||
|
|
||||||
assert_eq!(s1.next(), None);
|
|
||||||
|
|
||||||
let v1 = vec!["Vin Diesel".to_owned(), "Quentin Tarantino".to_owned()];
|
|
||||||
let mut v1 = MegaTokenizer::from(v1);
|
|
||||||
|
|
||||||
assert_eq!(v1.next(), Some((0, "Vin".into())));
|
|
||||||
assert_eq!(v1.next(), Some((1, "Diesel".into())));
|
|
||||||
|
|
||||||
assert_eq!(v1.next(), Some((8, "Quentin".into())));
|
|
||||||
assert_eq!(v1.next(), Some((9, "Tarantino".into())));
|
|
||||||
|
|
||||||
assert_eq!(v1.next(), None);
|
|
||||||
}
|
|
||||||
|
|
||||||
pub trait TokenizerBuilder {
|
pub trait TokenizerBuilder {
|
||||||
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
|
fn build<'a>(&self, text: &'a str) -> Box<Iterator<Item=(usize, &'a str)> + 'a>;
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user