Merge pull request #105 from Kerollmops/custom-ranking-field-into-hashmap

Save the custom ranking field into a HashMap
This commit is contained in:
Clément Renault 2019-02-11 17:36:26 +01:00 committed by GitHub
commit cf58cf86da
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 514 additions and 110 deletions

View File

@ -1,3 +1,5 @@
use crate::DocumentId;
use crate::database::schema::SchemaAttr;
use std::sync::Arc; use std::sync::Arc;
use std::error::Error; use std::error::Error;
use std::ffi::OsStr; use std::ffi::OsStr;
@ -6,12 +8,13 @@ use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::atomic::{AtomicBool, Ordering};
use std::ops::{Deref, DerefMut}; use std::ops::{Deref, DerefMut};
use crossbeam::atomic::ArcCell;
use log::{info, error, warn};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions}; use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, MergeOperands}; use rocksdb::{DB, MergeOperands};
use crossbeam::atomic::ArcCell;
use lockfree::map::Map; use lockfree::map::Map;
use hashbrown::HashMap;
use log::{info, error, warn};
pub use self::document_key::{DocumentKey, DocumentKeyAttr}; pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::view::{DatabaseView, DocumentIter}; pub use self::view::{DatabaseView, DocumentIter};
@ -19,12 +22,17 @@ pub use self::update::Update;
pub use self::serde::SerializerError; pub use self::serde::SerializerError;
pub use self::schema::Schema; pub use self::schema::Schema;
pub use self::index::Index; pub use self::index::Index;
pub use self::number::{Number, ParseNumberError};
const DATA_INDEX: &[u8] = b"data-index"; pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
const DATA_SCHEMA: &[u8] = b"data-schema";
const DATA_INDEX: &[u8] = b"data-index";
const DATA_RANKED_MAP: &[u8] = b"data-ranked-map";
const DATA_SCHEMA: &[u8] = b"data-schema";
pub mod schema; pub mod schema;
pub(crate) mod index; pub(crate) mod index;
mod number;
mod document_key; mod document_key;
mod serde; mod serde;
mod update; mod update;
@ -61,9 +69,16 @@ where D: Deref<Target=DB>
Ok(index) Ok(index)
} }
fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> { fn retrieve_data_ranked_map<D>(snapshot: &Snapshot<D>) -> Result<RankedMap, Box<Error>>
assert_eq!(key, DATA_INDEX, "The merge operator only supports \"data-index\" merging"); where D: Deref<Target=DB>,
{
match snapshot.get(DATA_RANKED_MAP)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(HashMap::new()),
}
}
fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
let mut index: Option<Index> = None; let mut index: Option<Index> = None;
for bytes in existing.into_iter().chain(operands) { for bytes in existing.into_iter().chain(operands) {
let operand = Index::from_bytes(bytes.to_vec()).unwrap(); let operand = Index::from_bytes(bytes.to_vec()).unwrap();
@ -81,6 +96,28 @@ fn merge_indexes(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperan
bytes bytes
} }
fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
let mut ranked_map: Option<RankedMap> = None;
for bytes in existing.into_iter().chain(operands) {
let operand: RankedMap = bincode::deserialize(bytes).unwrap();
match ranked_map {
Some(ref mut ranked_map) => ranked_map.extend(operand),
None => { ranked_map.replace(operand); },
};
}
let ranked_map = ranked_map.unwrap_or_default();
bincode::serialize(&ranked_map).unwrap()
}
fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
match key {
DATA_INDEX => merge_indexes(existing, operands),
DATA_RANKED_MAP => merge_ranked_maps(existing, operands),
key => panic!("The merge operator does not support merging {:?}", key),
}
}
pub struct IndexUpdate { pub struct IndexUpdate {
index: String, index: String,
update: Update, update: Update,
@ -103,14 +140,14 @@ impl DerefMut for IndexUpdate {
struct DatabaseIndex { struct DatabaseIndex {
db: Arc<DB>, db: Arc<DB>,
// This view is updated each time the DB ingests an update // This view is updated each time the DB ingests an update.
view: ArcCell<DatabaseView<Arc<DB>>>, view: ArcCell<DatabaseView<Arc<DB>>>,
// This path is the path to the mdb folder stored on disk // The path of the mdb folder stored on disk.
path: PathBuf, path: PathBuf,
// must_die false by default, must be set as true when the Index is dropped. // must_die false by default, must be set as true when the Index is dropped.
// It's used to erase the folder saved on disk when the user request to delete an index // It is used to erase the folder saved on disk when the user request to delete an index.
must_die: AtomicBool, must_die: AtomicBool,
} }
@ -128,7 +165,7 @@ impl DatabaseIndex {
// opts.error_if_exists(true); // FIXME pull request that // opts.error_if_exists(true); // FIXME pull request that
let mut cf_opts = ColumnFamilyOptions::new(); let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes); cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
@ -140,7 +177,6 @@ impl DatabaseIndex {
let snapshot = Snapshot::new(db.clone()); let snapshot = Snapshot::new(db.clone());
let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?)); let view = ArcCell::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(DatabaseIndex { Ok(DatabaseIndex {
db: db, db: db,
view: view, view: view,
@ -156,7 +192,7 @@ impl DatabaseIndex {
opts.create_if_missing(false); opts.create_if_missing(false);
let mut cf_opts = ColumnFamilyOptions::new(); let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data-index merge operator", merge_indexes); cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;

98
src/database/number.rs Normal file
View File

@ -0,0 +1,98 @@
use std::cmp::Ordering;
use std::str::FromStr;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
/// A numeric value that is either an unsigned integer,
/// a signed integer or a floating point number.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone)]
pub enum Number {
    /// An unsigned 64-bit integer.
    Unsigned(u64),
    /// A signed 64-bit integer.
    Signed(i64),
    /// A 64-bit floating point number.
    Float(f64),
}
impl FromStr for Number {
    type Err = ParseNumberError;

    /// Parses a string into a `Number`.
    ///
    /// The variants are tried in this order: `Unsigned`, `Signed`, `Float`.
    /// A float is only accepted when it is zero or a normal number, which
    /// means that NaN, infinite and subnormal values are rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if let Ok(value) = s.parse::<u64>() {
            return Ok(Number::Unsigned(value));
        }

        if let Ok(value) = s.parse::<i64>() {
            return Ok(Number::Signed(value));
        }

        match s.parse::<f64>() {
            Ok(value) if value == 0.0 || value.is_normal() => Ok(Number::Float(value)),
            _ => Err(ParseNumberError),
        }
    }
}
impl PartialOrd for Number {
    /// Always returns `Some`: the result is the one given by `Ord::cmp`.
    fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for Number {
    /// Compares two numbers by value, whatever their variants are.
    ///
    /// - Two integers are compared exactly; when the signedness differs
    ///   both sides are widened to `i128` first.
    /// - As soon as one side is a float, the other side is converted to
    ///   `f64` (a conversion that may round large integers) and both are
    ///   compared as floats.
    /// - NaN is ordered after every other value and is equal to itself,
    ///   so that the ordering stays total as `Ord` requires.
    fn cmp(&self, other: &Number) -> Ordering {
        use Number::*;

        // `partial_cmp` only returns `None` when at least one side is NaN.
        // In this case NaN is treated as the greatest value: returning
        // `Equal` here would make NaN equal to every number and break
        // the transitivity that `Ord` implementations must guarantee.
        fn cmp_floats(lhs: f64, rhs: f64) -> Ordering {
            lhs.partial_cmp(&rhs)
                .unwrap_or_else(|| lhs.is_nan().cmp(&rhs.is_nan()))
        }

        match (self, other) {
            (Unsigned(s), Unsigned(o)) => s.cmp(o),
            (Signed(s), Signed(o)) => s.cmp(o),

            // i128 can represent every u64 and every i64 value.
            (Unsigned(s), Signed(o)) => i128::from(*s).cmp(&i128::from(*o)),
            (Signed(s), Unsigned(o)) => i128::from(*s).cmp(&i128::from(*o)),

            (Unsigned(s), Float(o)) => cmp_floats(*s as f64, *o),
            (Signed(s), Float(o)) => cmp_floats(*s as f64, *o),
            (Float(s), Unsigned(o)) => cmp_floats(*s, *o as f64),
            (Float(s), Signed(o)) => cmp_floats(*s, *o as f64),
            (Float(s), Float(o)) => cmp_floats(*s, *o),
        }
    }
}
impl PartialEq for Number {
    /// Two numbers are equal when `cmp` returns `Ordering::Equal`,
    /// values stored in different variants can therefore be equal.
    fn eq(&self, other: &Number) -> bool {
        self.cmp(other) == Ordering::Equal
    }
}

// Equality is entirely defined by the `Ord` implementation.
impl Eq for Number { }
/// The error returned when a string cannot be parsed into a `Number`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ParseNumberError;

impl fmt::Display for ParseNumberError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.write_str("can not parse number")
    }
}

// Makes the type usable wherever a standard error is expected
// (boxed errors, `?` conversions, `unwrap` on results).
impl std::error::Error for ParseNumberError { }

View File

@ -13,8 +13,9 @@ use crate::database::serde::find_id::FindDocumentIdSerializer;
use crate::database::serde::SerializerError; use crate::database::serde::SerializerError;
use crate::DocumentId; use crate::DocumentId;
pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false }; pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true }; pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] #[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct SchemaProps { pub struct SchemaProps {
@ -23,6 +24,9 @@ pub struct SchemaProps {
#[serde(default)] #[serde(default)]
indexed: bool, indexed: bool,
#[serde(default)]
ranked: bool,
} }
impl SchemaProps { impl SchemaProps {
@ -33,6 +37,10 @@ impl SchemaProps {
pub fn is_indexed(self) -> bool { pub fn is_indexed(self) -> bool {
self.indexed self.indexed
} }
/// Returns `true` when the `ranked` property is set.
pub fn is_ranked(self) -> bool {
    self.ranked
}
} }
impl BitOr for SchemaProps { impl BitOr for SchemaProps {
@ -42,6 +50,7 @@ impl BitOr for SchemaProps {
SchemaProps { SchemaProps {
stored: self.stored | other.stored, stored: self.stored | other.stored,
indexed: self.indexed | other.indexed, indexed: self.indexed | other.indexed,
ranked: self.ranked | other.ranked,
} }
} }
} }
@ -185,7 +194,8 @@ impl Schema {
} }
} }
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq)] #[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialOrd, Ord, PartialEq, Eq, Hash)]
pub struct SchemaAttr(pub(crate) u16); pub struct SchemaAttr(pub(crate) u16);
impl SchemaAttr { impl SchemaAttr {

View File

@ -17,6 +17,7 @@ macro_rules! forward_to_unserializable_type {
pub mod find_id; pub mod find_id;
pub mod key_to_string; pub mod key_to_string;
pub mod value_to_number;
pub mod serializer; pub mod serializer;
pub mod indexer_serializer; pub mod indexer_serializer;
pub mod deserializer; pub mod deserializer;

View File

@ -5,6 +5,7 @@ use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer; use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer; use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::value_to_number::ValueToNumberSerializer;
use crate::database::update::DocumentUpdate; use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError; use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder; use crate::tokenizer::TokenizerBuilder;
@ -155,8 +156,8 @@ where B: TokenizerBuilder
{ {
Ok(StructSerializer { Ok(StructSerializer {
schema: self.schema, schema: self.schema,
update: self.update,
document_id: self.document_id, document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder, tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words, stop_words: self.stop_words,
}) })
@ -229,6 +230,10 @@ where B: TokenizerBuilder
}; };
value.serialize(serializer)?; value.serialize(serializer)?;
} }
if props.is_ranked() {
let number = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, number)?;
}
} }
Ok(()) Ok(())
@ -276,6 +281,10 @@ where B: TokenizerBuilder
}; };
value.serialize(serializer)?; value.serialize(serializer)?;
} }
if props.is_ranked() {
let integer = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, integer)?;
}
} }
Ok(()) Ok(())

View File

@ -0,0 +1,176 @@
use std::str::FromStr;
use serde::Serialize;
use serde::{ser, ser::Error};
use crate::database::serde::SerializerError;
use crate::database::Number;
/// A serializer that turns one serialized value into a `Number`.
///
/// Integers and floats are stored in the matching `Number` variant, strings
/// are parsed as numbers and newtype structs are unwrapped. Bytes, options,
/// units, variants and every compound type are rejected with an error.
pub struct ValueToNumberSerializer;

impl ser::Serializer for ValueToNumberSerializer {
    type Ok = Number;
    type Error = SerializerError;

    // No compound type can be converted into a number.
    type SerializeSeq = ser::Impossible<Number, SerializerError>;
    type SerializeTuple = ser::Impossible<Number, SerializerError>;
    type SerializeTupleStruct = ser::Impossible<Number, SerializerError>;
    type SerializeTupleVariant = ser::Impossible<Number, SerializerError>;
    type SerializeMap = ser::Impossible<Number, SerializerError>;
    type SerializeStruct = ser::Impossible<Number, SerializerError>;
    type SerializeStructVariant = ser::Impossible<Number, SerializerError>;

    // These two methods are generated by a macro defined outside of this file.
    forward_to_unserializable_type! {
        bool => serialize_bool,
        char => serialize_char,
    }

    // Signed integers are widened to `i64` without loss.

    fn serialize_i8(self, v: i8) -> Result<Number, SerializerError> {
        Ok(Number::Signed(i64::from(v)))
    }

    fn serialize_i16(self, v: i16) -> Result<Number, SerializerError> {
        Ok(Number::Signed(i64::from(v)))
    }

    fn serialize_i32(self, v: i32) -> Result<Number, SerializerError> {
        Ok(Number::Signed(i64::from(v)))
    }

    fn serialize_i64(self, v: i64) -> Result<Number, SerializerError> {
        Ok(Number::Signed(v))
    }

    // Unsigned integers are widened to `u64` without loss.

    fn serialize_u8(self, v: u8) -> Result<Number, SerializerError> {
        Ok(Number::Unsigned(u64::from(v)))
    }

    fn serialize_u16(self, v: u16) -> Result<Number, SerializerError> {
        Ok(Number::Unsigned(u64::from(v)))
    }

    fn serialize_u32(self, v: u32) -> Result<Number, SerializerError> {
        Ok(Number::Unsigned(u64::from(v)))
    }

    fn serialize_u64(self, v: u64) -> Result<Number, SerializerError> {
        Ok(Number::Unsigned(v))
    }

    // Floats are stored as `f64`, no check is done on the value.

    fn serialize_f32(self, v: f32) -> Result<Number, SerializerError> {
        Ok(Number::Float(f64::from(v)))
    }

    fn serialize_f64(self, v: f64) -> Result<Number, SerializerError> {
        Ok(Number::Float(v))
    }

    /// Parses the string with the `FromStr` implementation of `Number`.
    fn serialize_str(self, text: &str) -> Result<Number, SerializerError> {
        text.parse::<Number>().map_err(SerializerError::custom)
    }

    fn serialize_bytes(self, _v: &[u8]) -> Result<Number, SerializerError> {
        Err(SerializerError::UnserializableType { name: "&[u8]" })
    }

    fn serialize_none(self) -> Result<Number, SerializerError> {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_some<T>(self, _value: &T) -> Result<Number, SerializerError>
    where T: Serialize + ?Sized,
    {
        Err(SerializerError::UnserializableType { name: "Option" })
    }

    fn serialize_unit(self) -> Result<Number, SerializerError> {
        Err(SerializerError::UnserializableType { name: "()" })
    }

    fn serialize_unit_struct(self, _name: &'static str) -> Result<Number, SerializerError> {
        Err(SerializerError::UnserializableType { name: "unit struct" })
    }

    fn serialize_unit_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
    ) -> Result<Number, SerializerError>
    {
        Err(SerializerError::UnserializableType { name: "unit variant" })
    }

    /// A newtype struct is transparent: its inner value is serialized.
    fn serialize_newtype_struct<T>(
        self,
        _name: &'static str,
        inner: &T,
    ) -> Result<Number, SerializerError>
    where T: Serialize + ?Sized,
    {
        inner.serialize(self)
    }

    fn serialize_newtype_variant<T>(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _value: &T,
    ) -> Result<Number, SerializerError>
    where T: Serialize + ?Sized,
    {
        Err(SerializerError::UnserializableType { name: "newtype variant" })
    }

    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, SerializerError> {
        Err(SerializerError::UnserializableType { name: "sequence" })
    }

    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, SerializerError> {
        Err(SerializerError::UnserializableType { name: "tuple" })
    }

    fn serialize_tuple_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleStruct, SerializerError>
    {
        Err(SerializerError::UnserializableType { name: "tuple struct" })
    }

    fn serialize_tuple_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeTupleVariant, SerializerError>
    {
        Err(SerializerError::UnserializableType { name: "tuple variant" })
    }

    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, SerializerError> {
        Err(SerializerError::UnserializableType { name: "map" })
    }

    fn serialize_struct(
        self,
        _name: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStruct, SerializerError>
    {
        Err(SerializerError::UnserializableType { name: "struct" })
    }

    fn serialize_struct_variant(
        self,
        _name: &'static str,
        _variant_index: u32,
        _variant: &'static str,
        _len: usize,
    ) -> Result<Self::SerializeStructVariant, SerializerError>
    {
        Err(SerializerError::UnserializableType { name: "struct variant" })
    }
}

View File

@ -16,8 +16,9 @@ use crate::tokenizer::TokenizerBuilder;
use crate::data::{DocIds, DocIndexes}; use crate::data::{DocIds, DocIndexes};
use crate::database::schema::Schema; use crate::database::schema::Schema;
use crate::database::index::Index; use crate::database::index::Index;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::{DocumentId, DocIndex}; use crate::{DocumentId, DocIndex};
use crate::database::DATA_INDEX;
pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec
@ -78,6 +79,7 @@ use UpdateType::{Updated, Deleted};
pub struct RawUpdateBuilder { pub struct RawUpdateBuilder {
documents_update: HashMap<DocumentId, UpdateType>, documents_update: HashMap<DocumentId, UpdateType>,
documents_ranked_fields: RankedMap,
indexed_words: BTreeMap<Token, Vec<DocIndex>>, indexed_words: BTreeMap<Token, Vec<DocIndex>>,
batch: WriteBatch, batch: WriteBatch,
} }
@ -86,6 +88,7 @@ impl RawUpdateBuilder {
pub fn new() -> RawUpdateBuilder { pub fn new() -> RawUpdateBuilder {
RawUpdateBuilder { RawUpdateBuilder {
documents_update: HashMap::new(), documents_update: HashMap::new(),
documents_ranked_fields: HashMap::new(),
indexed_words: BTreeMap::new(), indexed_words: BTreeMap::new(),
batch: WriteBatch::new(), batch: WriteBatch::new(),
} }
@ -137,9 +140,12 @@ impl RawUpdateBuilder {
let index = Index { negative, positive }; let index = Index { negative, positive };
// write the data-index // write the data-index
let mut bytes = Vec::new(); let mut bytes_index = Vec::new();
index.write_to_bytes(&mut bytes); index.write_to_bytes(&mut bytes_index);
self.batch.merge(DATA_INDEX, &bytes)?; self.batch.merge(DATA_INDEX, &bytes_index)?;
let bytes_ranked_map = bincode::serialize(&self.documents_ranked_fields).unwrap();
self.batch.merge(DATA_RANKED_MAP, &bytes_ranked_map)?;
Ok(self.batch) Ok(self.batch)
} }
@ -195,4 +201,23 @@ impl<'a> DocumentUpdate<'a> {
Ok(()) Ok(())
} }
/// Registers `number` as the ranked value of `attr` for the current document.
///
/// The document is marked as `Updated` if it had no update type yet.
///
/// # Errors
///
/// Returns an error when the document is already marked as `Deleted`
/// in the update being built.
pub fn register_ranked_attribute(
    &mut self,
    attr: SchemaAttr,
    number: Number,
) -> Result<(), SerializerError>
{
    use serde::ser::Error;

    let document_id = self.document_id;
    let update_type = self.inner.documents_update.entry(document_id).or_insert(Updated);

    match update_type {
        Deleted => Err(SerializerError::custom(
            "This document has already been deleted, ranked attributes cannot be added in the same update"
        )),
        _ => {
            self.inner.documents_ranked_fields.insert((document_id, attr), number);
            Ok(())
        },
    }
}
} }

View File

@ -7,12 +7,13 @@ use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned; use serde::de::DeserializeOwned;
use crate::database::{DocumentKey, DocumentKeyAttr}; use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map};
use crate::database::{retrieve_data_schema, retrieve_data_index};
use crate::database::serde::deserializer::Deserializer; use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema; use crate::database::schema::Schema;
use crate::database::index::Index; use crate::database::index::Index;
use crate::rank::{QueryBuilder, FilterFunc}; use crate::database::RankedMap;
use crate::DocumentId; use crate::DocumentId;
pub struct DatabaseView<D> pub struct DatabaseView<D>
@ -20,6 +21,7 @@ where D: Deref<Target=DB>
{ {
snapshot: Snapshot<D>, snapshot: Snapshot<D>,
index: Index, index: Index,
ranked_map: RankedMap,
schema: Schema, schema: Schema,
} }
@ -29,7 +31,8 @@ where D: Deref<Target=DB>
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> { pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?; let schema = retrieve_data_schema(&snapshot)?;
let index = retrieve_data_index(&snapshot)?; let index = retrieve_data_index(&snapshot)?;
Ok(DatabaseView { snapshot, index, schema }) let ranked_map = retrieve_data_ranked_map(&snapshot)?;
Ok(DatabaseView { snapshot, index, ranked_map, schema })
} }
pub fn schema(&self) -> &Schema { pub fn schema(&self) -> &Schema {
@ -40,6 +43,10 @@ where D: Deref<Target=DB>
&self.index &self.index
} }
/// Returns a reference to the ranked map held by this view.
pub fn ranked_map(&self) -> &RankedMap {
    &self.ranked_map
}
pub fn into_snapshot(self) -> Snapshot<D> { pub fn into_snapshot(self) -> Snapshot<D> {
self.snapshot self.snapshot
} }

View File

@ -7,6 +7,8 @@ pub mod rank;
pub mod tokenizer; pub mod tokenizer;
mod common_words; mod common_words;
use serde_derive::{Serialize, Deserialize};
pub use rocksdb; pub use rocksdb;
pub use self::tokenizer::Tokenizer; pub use self::tokenizer::Tokenizer;
@ -16,6 +18,7 @@ pub use self::common_words::CommonWords;
/// ///
/// It is used to inform the database the document you want to deserialize. /// It is used to inform the database the document you want to deserialize.
/// Helpful for custom ranking. /// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] #[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64); pub struct DocumentId(u64);

View File

@ -4,7 +4,7 @@ mod words_proximity;
mod sum_of_words_attribute; mod sum_of_words_attribute;
mod sum_of_words_position; mod sum_of_words_position;
mod exact; mod exact;
mod sort_by; mod sort_by_attr;
mod document_id; mod document_id;
use std::cmp::Ordering; use std::cmp::Ordering;
@ -17,7 +17,7 @@ pub use self::{
sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_attribute::SumOfWordsAttribute,
sum_of_words_position::SumOfWordsPosition, sum_of_words_position::SumOfWordsPosition,
exact::Exact, exact::Exact,
sort_by::SortBy, sort_by_attr::SortByAttr,
document_id::DocumentId, document_id::DocumentId,
}; };

View File

@ -1,83 +0,0 @@
use std::cmp::Ordering;
use std::ops::Deref;
use std::marker;
use rocksdb::DB;
use serde::de::DeserializeOwned;
use crate::rank::criterion::Criterion;
use crate::database::DatabaseView;
use crate::rank::RawDocument;
/// A helper struct that permits sorting documents by
/// some of their stored attributes.
///
/// # Note
///
/// If a document cannot be deserialized it will be considered [`None`][].
///
/// Deserialized documents are compared like `Some(doc0).cmp(&Some(doc1))`,
/// so you must check the [`Ord`] implementation of `Option`.
///
/// [`None`]: https://doc.rust-lang.org/std/option/enum.Option.html#variant.None
/// [`Ord`]: https://doc.rust-lang.org/std/option/enum.Option.html#impl-Ord
///
/// # Example
///
/// ```ignore
/// use serde_derive::Deserialize;
/// use meilidb::rank::criterion::*;
///
/// #[derive(Deserialize, PartialOrd, Ord, PartialEq, Eq)]
/// struct TimeOnly {
///     time: String,
/// }
///
/// let builder = CriteriaBuilder::with_capacity(8)
///        .add(SumOfTypos)
///        .add(NumberOfWords)
///        .add(WordsProximity)
///        .add(SumOfWordsAttribute)
///        .add(SumOfWordsPosition)
///        .add(Exact)
///        .add(SortBy::<TimeOnly>::new(&view))
///        .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortBy<'a, T, D>
where D: Deref<Target=DB> + Send + Sync,
      T: Send + Sync
{
    // The view the documents are read from.
    view: &'a DatabaseView<D>,
    // `T` is only used as the deserialization target, no value is stored.
    _phantom: marker::PhantomData<T>,
}
impl<'a, T, D> SortBy<'a, T, D>
where D: Deref<Target=DB> + Send + Sync,
      T: Send + Sync
{
    /// Creates a criterion that reads the documents to compare from `view`.
    pub fn new(view: &'a DatabaseView<D>) -> Self {
        SortBy { view, _phantom: marker::PhantomData }
    }
}
impl<'a, T, D> Criterion for SortBy<'a, T, D>
where D: Deref<Target=DB> + Send + Sync,
      T: DeserializeOwned + Ord + Send + Sync,
{
    /// Deserializes both documents as `T` and compares them.
    ///
    /// A document that cannot be retrieved is replaced by `None` after
    /// the error has been printed on the standard error output.
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left = self.view.document_by_id::<T>(lhs.id)
            .map_err(|e| eprintln!("{}", e))
            .ok();

        let right = self.view.document_by_id::<T>(rhs.id)
            .map_err(|e| eprintln!("{}", e))
            .ok();

        left.cmp(&right)
    }
}

View File

@ -0,0 +1,122 @@
use std::cmp::Ordering;
use std::error::Error;
use std::fmt;
use crate::database::schema::{Schema, SchemaAttr};
use crate::rank::criterion::Criterion;
use crate::database::RankedMap;
use crate::rank::RawDocument;
/// A criterion that sorts documents by the value of one of
/// their attributes, the values being read from a `RankedMap`.
///
/// # Note
///
/// A document that has no value for the attribute in the ranked map
/// compares as `Ordering::Greater` than a document that has one,
/// whatever the sort direction is. Two documents that both have
/// no value are considered equal.
///
/// # Example
///
/// ```ignore
/// use meilidb::rank::criterion::*;
///
/// let custom_ranking = SortByAttr::lower_is_better(&ranked_map, &schema, "published_at")?;
///
/// let builder = CriteriaBuilder::with_capacity(8)
///        .add(SumOfTypos)
///        .add(NumberOfWords)
///        .add(WordsProximity)
///        .add(SumOfWordsAttribute)
///        .add(SumOfWordsPosition)
///        .add(Exact)
///        .add(custom_ranking)
///        .add(DocumentId);
///
/// let criterion = builder.build();
///
/// ```
pub struct SortByAttr<'a> {
    // The map the values to compare are read from.
    ranked_map: &'a RankedMap,
    // The attribute used to sort the documents.
    attr: SchemaAttr,
    // When `true` the ordering of two existing values is reversed.
    reversed: bool,
}
impl<'a> SortByAttr<'a> {
    /// Creates a criterion for which the lowest values of
    /// the attribute named `attr_name` come first.
    ///
    /// # Errors
    ///
    /// Returns an error if the attribute is not part of the schema
    /// or if it is not registered for ranking.
    pub fn lower_is_better(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        SortByAttr::new(ranked_map, schema, attr_name, false)
    }

    /// Creates a criterion for which the highest values of
    /// the attribute named `attr_name` come first.
    ///
    /// # Errors
    ///
    /// Returns an error if the attribute is not part of the schema
    /// or if it is not registered for ranking.
    pub fn higher_is_better(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        SortByAttr::new(ranked_map, schema, attr_name, true)
    }

    /// Resolves `attr_name` in the schema and checks
    /// that the attribute can be used for ranking.
    fn new(
        ranked_map: &'a RankedMap,
        schema: &Schema,
        attr_name: &str,
        reversed: bool,
    ) -> Result<SortByAttr<'a>, SortByAttrError>
    {
        let attr = schema.attribute(attr_name)
            .ok_or(SortByAttrError::AttributeNotFound)?;

        // Only ranked attributes have their values saved in the ranked map,
        // an attribute that is NOT ranked must therefore be refused.
        if !schema.props(attr).is_ranked() {
            return Err(SortByAttrError::AttributeNotRegisteredForRanking);
        }

        Ok(SortByAttr { ranked_map, attr, reversed })
    }
}
impl<'a> Criterion for SortByAttr<'a> {
    /// Compares the values both documents have in the ranked map.
    ///
    /// The sort direction only applies when both values exist: a document
    /// without value is always `Greater` than a document that has one.
    fn evaluate(&self, lhs: &RawDocument, rhs: &RawDocument) -> Ordering {
        let left = self.ranked_map.get(&(lhs.id, self.attr));
        let right = self.ranked_map.get(&(rhs.id, self.attr));

        let (left, right) = match (left, right) {
            (Some(left), Some(right)) => (left, right),
            (None, None) => return Ordering::Equal,
            (None, Some(_)) => return Ordering::Greater,
            (Some(_), None) => return Ordering::Less,
        };

        let order = left.cmp(right);
        if self.reversed {
            order.reverse()
        } else {
            order
        }
    }
}
/// The errors that can be returned when a `SortByAttr` criterion is created.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SortByAttrError {
    /// No attribute with the given name exists in the schema.
    AttributeNotFound,
    /// The attribute exists but is not registered for ranking.
    AttributeNotRegisteredForRanking,
}

impl fmt::Display for SortByAttrError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let message = match *self {
            SortByAttrError::AttributeNotFound => {
                "attribute not found in the schema"
            },
            SortByAttrError::AttributeNotRegisteredForRanking => {
                "attribute not registered for ranking"
            },
        };

        f.write_str(message)
    }
}

impl Error for SortByAttrError { }