Merge pull request #147 from meilisearch/moving-to-sled

Make the repository a workspace and move to sled
Clément Renault 2019-04-29 15:21:02 +02:00 committed by GitHub
commit d7ce6d016b
60 changed files with 2314 additions and 3226 deletions

.gitignore (vendored, 3 changes)

@@ -1,6 +1,7 @@
-/rocksdb
 /target
 /Cargo.lock
+meilidb/Cargo.lock
+meilidb-core/Cargo.lock
 **/*.rs.bk
 **/*.csv
 **/*.json_lines

Cargo.toml

@@ -1,55 +1,10 @@
-[package]
-edition = "2018"
-name = "meilidb"
-version = "0.3.2"
-authors = ["Kerollmops <renault.cle@gmail.com>"]
-
-[dependencies]
-arc-swap = "0.3.7"
-bincode = "1.1.2"
-byteorder = "1.3.1"
-fst = "0.3.3"
-hashbrown = { version = "0.1.8", features = ["serde"] }
-lazy_static = "1.2.0"
-levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
-linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
-lockfree = "0.5.1"
-log = "0.4.6"
-rayon = "1.0.3"
-sdset = "0.3.1"
-serde = "1.0.88"
-serde_derive = "1.0.88"
-serde_json = { version = "1.0.38", features = ["preserve_order"] }
-size_format = "1.0.2"
-slice-group-by = "0.2.4"
-unidecode = "0.3.0"
-
-[dependencies.toml]
-git = "https://github.com/Kerollmops/toml-rs.git"
-features = ["preserve_order"]
-rev = "0372ba6"
-
-[dependencies.rocksdb]
-git = "https://github.com/pingcap/rust-rocksdb.git"
-rev = "306e201"
-
-[features]
-default = ["simd"]
-i128 = ["bincode/i128", "byteorder/i128"]
-portable = ["rocksdb/portable"]
-simd = ["rocksdb/sse"]
-nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
-
-[dev-dependencies]
-csv = "1.0.5"
-env_logger = "0.6.0"
-jemallocator = "0.1.9"
-quickcheck = "0.8.2"
-rand = "0.6.5"
-rand_xorshift = "0.1.1"
-structopt = "0.2.14"
-tempfile = "3.0.7"
-termcolor = "1.0.4"
+[workspace]
+members = [
+    "meilidb",
+    "meilidb-core",
+    "meilidb-data",
+    "meilidb-tokenizer",
+]
 
 [profile.release]
 debug = true


@@ -1,19 +0,0 @@
-# This schema has been generated ...
-# The order in which the attributes are declared is important,
-# it specify the attribute xxx...
-
-identifier = "id"
-
-[attributes.id]
-stored = true
-
-[attributes.title]
-stored = true
-indexed = true
-
-[attributes.description]
-stored = true
-indexed = true
-
-[attributes.image]
-stored = true

meilidb-core/Cargo.toml (new file, 29 lines)

@@ -0,0 +1,29 @@
+[package]
+name = "meilidb-core"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+byteorder = "1.3.1"
+hashbrown = "0.2.2"
+lazy_static = "1.2.0"
+log = "0.4.6"
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
+rayon = "1.0.3"
+sdset = "0.3.1"
+serde = { version = "1.0.88", features = ["derive"] }
+slice-group-by = "0.2.4"
+
+[dependencies.fst]
+git = "https://github.com/Kerollmops/fst.git"
+branch = "arc-byte-slice"
+
+[dependencies.levenshtein_automata]
+git = "https://github.com/Kerollmops/levenshtein-automata.git"
+branch = "arc-byte-slice"
+features = ["fst_automaton"]
+
+[features]
+i128 = ["byteorder/i128"]
+nightly = ["hashbrown/nightly", "slice-group-by/nightly"]

meilidb-core/src/criterion/document_id.rs

@@ -1,7 +1,6 @@
 use std::cmp::Ordering;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[derive(Debug, Clone, Copy)]
 pub struct DocumentId;

meilidb-core/src/criterion/exact.rs

@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize {

meilidb-core/src/criterion/mod.rs

@@ -4,11 +4,10 @@ mod words_proximity;
 mod sum_of_words_attribute;
 mod sum_of_words_position;
 mod exact;
-mod sort_by_attr;
 mod document_id;
 
 use std::cmp::Ordering;
-use crate::rank::RawDocument;
+use crate::RawDocument;
 
 pub use self::{
     sum_of_typos::SumOfTypos,
@@ -17,7 +16,6 @@ pub use self::{
     sum_of_words_attribute::SumOfWordsAttribute,
     sum_of_words_position::SumOfWordsPosition,
     exact::Exact,
-    sort_by_attr::SortByAttr,
     document_id::DocumentId,
 };

meilidb-core/src/criterion/number_of_words.rs

@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn number_of_query_words(query_index: &[u32]) -> usize {

meilidb-core/src/criterion/sum_of_typos.rs

@@ -2,8 +2,8 @@ use std::cmp::Ordering;
 use slice_group_by::GroupBy;
 
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 // This function is a wrong logarithmic 10 function.
 // It is safe to panic on input number higher than 3,

meilidb-core/src/criterion/sum_of_words_attribute.rs

@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {

meilidb-core/src/criterion/sum_of_words_position.rs

@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {

meilidb-core/src/criterion/words_proximity.rs

@@ -1,9 +1,7 @@
 use std::cmp::{self, Ordering};
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 const MAX_DISTANCE: u16 = 8;


@@ -1,9 +1,9 @@
 use std::sync::Arc;
 use std::ops::Deref;
 
-#[derive(Default, Clone)]
+#[derive(Clone)]
 pub struct SharedData {
-    pub bytes: Arc<Vec<u8>>,
+    pub bytes: Arc<[u8]>,
     pub offset: usize,
     pub len: usize,
 }
@@ -15,7 +15,7 @@ impl SharedData {
         SharedData::new(bytes, 0, len)
     }
 
-    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+    pub fn new(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedData {
         SharedData { bytes, offset, len }
     }
@@ -33,6 +33,16 @@ impl SharedData {
     }
 }
 
+impl Default for SharedData {
+    fn default() -> SharedData {
+        SharedData {
+            bytes: Arc::from(Vec::new()),
+            offset: 0,
+            len: 0,
+        }
+    }
+}
+
 impl Deref for SharedData {
     type Target = [u8];

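Note on the SharedData change above: Arc<[u8]> removes the double indirection of Arc<Vec<u8>> (one allocation instead of a pointer to a separately allocated vector), but at the time Arc<[u8]> did not implement Default, which is why the derive is replaced by a manual impl. A minimal standalone sketch of the conversion that impl relies on:

use std::sync::Arc;

fn main() {
    // `From<Vec<u8>>` moves the vector's contents into a single
    // reference-counted slice allocation; an empty Vec gives an
    // empty Arc<[u8]>, which is what the manual Default returns.
    let empty: Arc<[u8]> = Arc::from(Vec::new());
    assert_eq!(empty.len(), 0);

    let bytes: Arc<[u8]> = Arc::from(vec![1u8, 2, 3]);
    assert_eq!(&bytes[..], &[1u8, 2, 3][..]);
}
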
meilidb-core/src/lib.rs

@@ -1,15 +1,117 @@
 pub mod criterion;
+pub mod data;
+mod index;
+mod automaton;
 mod query_builder;
 mod distinct_map;
+pub mod shared_data_cursor;
+pub mod write_to_bytes;
 
 use std::sync::Arc;
 
+use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use rayon::slice::ParallelSliceMut;
 
-use crate::{Match, DocumentId};
-
-pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
+pub use self::index::{Index, IndexBuilder};
+pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
+
+/// Represent an internally generated document unique identifier.
+///
+/// It is used to inform the database the document you want to deserialize.
+/// Helpful for custom ranking.
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+pub struct DocumentId(pub u64);
+
+/// This structure represent the position of a word
+/// in a document and its attributes.
+///
+/// This is stored in the map, generated at index time,
+/// extracted and interpreted at search time.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(C)]
+pub struct DocIndex {
+    /// The document identifier where the word was found.
+    pub document_id: DocumentId,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// The position in bytes where the word was found
+    /// along with the length of it.
+    ///
+    /// It informs on the original word area in the text indexed
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+/// This structure represent a matching word with informations
+/// on the location of the word in the document.
+///
+/// The order of the field is important because it defines
+/// the way these structures are ordered between themselves.
+///
+/// The word in itself is not important.
+// TODO do data oriented programming ? very arrays ?
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Match {
+    /// The word index in the query sentence.
+    /// Same as the `attribute_index` but for the query words.
+    ///
+    /// Used to retrieve the automaton that match this word.
+    pub query_index: u32,
+
+    /// The distance the word has with the query word
+    /// (i.e. the Levenshtein distance).
+    pub distance: u8,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// Whether the word that match is an exact match or a prefix.
+    pub is_exact: bool,
+
+    /// The position in bytes where the word was found
+    /// along with the length of it.
+    ///
+    /// It informs on the original word area in the text indexed
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+impl Match {
+    pub fn zero() -> Self {
+        Match {
+            query_index: 0,
+            distance: 0,
+            attribute: 0,
+            word_index: 0,
+            is_exact: false,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
+    pub fn max() -> Self {
+        Match {
+            query_index: u32::max_value(),
+            distance: u8::max_value(),
+            attribute: u16::max_value(),
+            word_index: u16::max_value(),
+            is_exact: true,
+            char_index: u16::max_value(),
+            char_length: u16::max_value(),
+        }
+    }
+}
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
@@ -181,3 +283,15 @@ impl Matches {
         }
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use std::mem;
+
+    #[test]
+    fn docindex_mem_size() {
+        assert_eq!(mem::size_of::<DocIndex>(), 16);
+    }
+}

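The docindex_mem_size test added above holds by layout arithmetic: with #[repr(C)], DocIndex is one u64 (8 bytes, 8-byte alignment) followed by four u16 fields (2 bytes each), so 8 + 4 * 2 = 16 bytes, and 16 is already a multiple of 8, so no tail padding is inserted. A standalone sketch of the same check (the struct below mirrors the diff but is illustrative, using a bare u64 in place of DocumentId, which wraps one):

use std::mem;

// Illustrative mirror of DocIndex's field layout.
#[repr(C)]
struct DocIndexLayout {
    document_id: u64, // 8 bytes, 8-byte alignment
    attribute: u16,   // 2 bytes
    word_index: u16,  // 2 bytes
    char_index: u16,  // 2 bytes
    char_length: u16, // 2 bytes
}

fn main() {
    // 8 + 4 * 2 = 16, already 8-aligned: no padding needed.
    assert_eq!(mem::size_of::<DocIndexLayout>(), 16);
    assert_eq!(mem::align_of::<DocIndexLayout>(), 8);
}
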
meilidb-core/src/query_builder.rs

@@ -1,53 +1,27 @@
-use std::{cmp, mem};
-use std::ops::Range;
-use std::time::Instant;
 use std::hash::Hash;
+use std::ops::{Range, Deref};
 use std::rc::Rc;
+use std::time::Instant;
+use std::{cmp, mem};
 
 use rayon::slice::ParallelSliceMut;
-use slice_group_by::{GroupByMut, LinearStrGroupBy};
+use slice_group_by::GroupByMut;
+use meilidb_tokenizer::{is_cjk, split_query_string};
 use hashbrown::{HashMap, HashSet};
 use fst::Streamer;
 use log::info;
 
 use crate::automaton::{self, DfaExt, AutomatonExt};
-use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
-use crate::rank::criterion::Criteria;
-use crate::database::Index;
-use crate::rank::{raw_documents_from_matches, RawDocument, Document};
-use crate::{is_cjk, Match, DocumentId};
-
-#[derive(Debug, PartialEq, Eq)]
-enum CharCategory {
-    Space,
-    Cjk,
-    Other,
-}
-
-fn classify_char(c: char) -> CharCategory {
-    if c.is_whitespace() { CharCategory::Space }
-    else if is_cjk(c) { CharCategory::Cjk }
-    else { CharCategory::Other }
-}
-
-fn is_word(s: &&str) -> bool {
-    !s.chars().any(char::is_whitespace)
-}
-
-fn same_group_category(a: char, b: char) -> bool {
-    let ca = classify_char(a);
-    let cb = classify_char(b);
-    if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
-}
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::criterion::Criteria;
+use crate::{raw_documents_from_matches, RawDocument, Document};
+use crate::{Index, Match, DocumentId};
 
-fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
+fn generate_automatons(query: &str) -> Vec<DfaExt> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let mut groups = LinearStrGroupBy::new(query, same_group_category)
-        .filter(is_word)
-        .map(str::to_lowercase)
-        .peekable();
+    let mut groups = split_query_string(query).map(str::to_lowercase).peekable();
     let mut automatons = Vec::new();
 
     while let Some(word) = groups.next() {
         let has_following_word = groups.peek().is_some();
         let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
@@ -61,28 +35,26 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
     automatons
 }
 
-pub type FilterFunc = fn(DocumentId) -> bool;
-
-pub struct QueryBuilder<'i, 'c, FI> {
-    index: &'i Index,
+pub struct QueryBuilder<'c, I, FI = fn(DocumentId) -> bool> {
+    index: I,
     criteria: Criteria<'c>,
     searchable_attrs: Option<HashSet<u16>>,
     filter: Option<FI>,
 }
 
-impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
-    pub fn new(index: &'i Index) -> Self {
+impl<'c, I> QueryBuilder<'c, I, fn(DocumentId) -> bool> {
+    pub fn new(index: I) -> Self {
         QueryBuilder::with_criteria(index, Criteria::default())
     }
 
-    pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
+    pub fn with_criteria(index: I, criteria: Criteria<'c>) -> Self {
         QueryBuilder { index, criteria, searchable_attrs: None, filter: None }
     }
 }
 
-impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
 {
-    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
+    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'c, I, F>
     where F: Fn(DocumentId) -> bool,
     {
         QueryBuilder {
@@ -93,7 +65,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
         }
     }
 
-    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
+    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'c, I, FI, F>
     where F: Fn(DocumentId) -> Option<K>,
           K: Hash + Eq,
     {
@@ -108,9 +80,13 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
         let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new);
         attributes.insert(attribute);
     }
+}
+
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index>,
+{
     fn query_all(&self, query: &str) -> Vec<RawDocument> {
-        let automatons = split_whitespace_automatons(query);
+        let automatons = generate_automatons(query);
 
         let mut stream = {
             let mut op_builder = fst::map::OpBuilder::new();
@@ -118,7 +94,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
                 let stream = self.index.map.search(automaton);
                 op_builder.push(stream);
             }
-            op_builder.union()
+            op_builder.r#union()
         };
 
         let mut matches = Vec::new();
@@ -159,8 +135,9 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
     }
 }
 
-impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
-where FI: Fn(DocumentId) -> bool,
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index>,
+      FI: Fn(DocumentId) -> bool,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
         // We delegate the filter work to the distinct query builder,
@@ -212,15 +189,15 @@ where FI: Fn(DocumentId) -> bool,
     }
 }
 
-pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
-    inner: QueryBuilder<'i, 'c, FI>,
+pub struct DistinctQueryBuilder<'c, I, FI, FD> {
+    inner: QueryBuilder<'c, I, FI>,
     function: FD,
     size: usize,
 }
 
-impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
+impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
 {
-    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
+    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'c, I, F, FD>
     where F: Fn(DocumentId) -> bool,
     {
         DistinctQueryBuilder {
@@ -235,8 +212,9 @@ impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
     }
 }
 
-impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
-where FI: Fn(DocumentId) -> bool,
+impl<'c, I, FI, FD, K> DistinctQueryBuilder<'c, I, FI, FD>
+where I: Deref<Target=Index>,
+      FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,
 {

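The central change in this file: QueryBuilder no longer borrows a &'i Index but owns any I: Deref<Target = Index>, and the query methods are only implemented under that bound. This lets meilidb-data hand it either a plain reference or the arc-swap Lease<Arc<Index>> seen later in database.rs. A minimal sketch of the pattern, with a hypothetical Index stand-in:

use std::ops::Deref;
use std::sync::Arc;

struct Index;

struct QueryBuilder<I> {
    index: I,
}

impl<I> QueryBuilder<I>
where I: Deref<Target = Index>,
{
    fn new(index: I) -> Self {
        QueryBuilder { index }
    }

    fn index(&self) -> &Index {
        // Deref gives uniform access whatever the owner type is.
        &*self.index
    }
}

fn main() {
    let shared = Arc::new(Index);
    let _owns = QueryBuilder::new(shared.clone()); // I = Arc<Index>
    let _borrows = QueryBuilder::new(&*shared);    // I = &Index
}
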
meilidb-core/src/shared_data_cursor.rs

@@ -7,12 +7,12 @@ pub struct SharedDataCursor(Cursor<SharedData>);
 impl SharedDataCursor {
     pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
         let len = bytes.len();
-        let bytes = Arc::new(bytes);
+        let bytes = Arc::from(bytes);
 
         SharedDataCursor::from_shared_bytes(bytes, 0, len)
     }
 
-    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
+    pub fn from_shared_bytes(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedDataCursor {
         let data = SharedData::new(bytes, offset, len);
         let cursor = Cursor::new(data);

meilidb-data/Cargo.toml (new file, 25 lines)

@@ -0,0 +1,25 @@
+[package]
+name = "meilidb-data"
+version = "0.1.0"
+authors = ["Kerollmops <renault.cle@gmail.com>"]
+edition = "2018"
+
+[dependencies]
+arc-swap = "0.3.11"
+bincode = "1.1.2"
+byteorder = "1.3.1"
+hashbrown = { version = "0.2.2", features = ["serde"] }
+linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
+meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
+ordered-float = { version = "1.0.2", features = ["serde"] }
+sdset = "0.3.1"
+serde = { version = "1.0.90", features = ["derive"] }
+serde_json = { version = "1.0.39", features = ["preserve_order"] }
+sled = "0.23.0"
+toml = { version = "0.5.0", features = ["preserve_order"] }
+deunicode = "1.0.0"
+
+[dependencies.rmp-serde]
+git = "https://github.com/3Hren/msgpack-rust.git"
+rev = "40b3d48"

meilidb-data/src/database.rs (new file)

@@ -0,0 +1,464 @@
use std::collections::HashSet;
use std::io::{self, Cursor, BufRead};
use std::iter::FromIterator;
use std::path::Path;
use std::sync::Arc;
use std::{error, fmt};
use arc_swap::{ArcSwap, Lease};
use byteorder::{ReadBytesExt, BigEndian};
use hashbrown::HashMap;
use meilidb_core::criterion::Criteria;
use meilidb_core::QueryBuilder;
use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::{DocumentId, Index as WordIndex};
use rmp_serde::decode::{Error as RmpError};
use sdset::SetBuf;
use serde::de;
use sled::IVec;
use crate::{Schema, SchemaAttr, RankedMap};
use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
use crate::indexer::Indexer;
#[derive(Debug)]
pub enum Error {
SchemaDiffer,
SchemaMissing,
WordIndexMissing,
MissingDocumentId,
SledError(sled::Error),
BincodeError(bincode::Error),
SerializerError(SerializerError),
}
impl From<sled::Error> for Error {
fn from(error: sled::Error) -> Error {
Error::SledError(error)
}
}
impl From<bincode::Error> for Error {
fn from(error: bincode::Error) -> Error {
Error::BincodeError(error)
}
}
impl From<SerializerError> for Error {
fn from(error: SerializerError) -> Error {
Error::SerializerError(error)
}
}
impl fmt::Display for Error {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
use self::Error::*;
match self {
SchemaDiffer => write!(f, "schemas differ"),
SchemaMissing => write!(f, "this index does not have a schema"),
WordIndexMissing => write!(f, "this index does not have a word index"),
MissingDocumentId => write!(f, "document id is missing"),
SledError(e) => write!(f, "sled error; {}", e),
BincodeError(e) => write!(f, "bincode error; {}", e),
SerializerError(e) => write!(f, "serializer error; {}", e),
}
}
}
impl error::Error for Error { }
fn index_name(name: &str) -> Vec<u8> {
format!("index-{}", name).into_bytes()
}
fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec<u8> {
let DocumentId(document_id) = id;
let SchemaAttr(schema_attr) = attr;
let mut bytes = Vec::new();
bytes.extend_from_slice(b"document-");
bytes.extend_from_slice(&document_id.to_be_bytes()[..]);
bytes.extend_from_slice(&schema_attr.to_be_bytes()[..]);
bytes
}
trait CursorExt {
fn consume_if_eq(&mut self, needle: &[u8]) -> bool;
}
impl<T: AsRef<[u8]>> CursorExt for Cursor<T> {
fn consume_if_eq(&mut self, needle: &[u8]) -> bool {
let position = self.position() as usize;
let slice = self.get_ref().as_ref();
if slice[position..].starts_with(needle) {
self.consume(needle.len());
true
} else {
false
}
}
}
fn extract_document_key(key: Vec<u8>) -> io::Result<(DocumentId, SchemaAttr)> {
let mut key = Cursor::new(key);
if !key.consume_if_eq(b"document-") {
return Err(io::Error::from(io::ErrorKind::InvalidData))
}
let document_id = key.read_u64::<BigEndian>().map(DocumentId)?;
let schema_attr = key.read_u16::<BigEndian>().map(SchemaAttr)?;
Ok((document_id, schema_attr))
}
#[derive(Clone)]
pub struct Database {
opened: Arc<ArcSwap<HashMap<String, RawIndex>>>,
inner: sled::Db,
}
impl Database {
pub fn start_default<P: AsRef<Path>>(path: P) -> Result<Database, Error> {
let inner = sled::Db::start_default(path)?;
let opened = Arc::new(ArcSwap::new(Arc::new(HashMap::new())));
Ok(Database { opened, inner })
}
pub fn open_index(&self, name: &str) -> Result<Option<Index>, Error> {
// check if the index was already opened
if let Some(raw_index) = self.opened.lease().get(name) {
return Ok(Some(Index(raw_index.clone())))
}
let raw_name = index_name(name);
if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) {
let tree = self.inner.open_tree(raw_name)?;
let raw_index = RawIndex::from_raw(tree)?;
self.opened.rcu(|opened| {
let mut opened = HashMap::clone(opened);
opened.insert(name.to_string(), raw_index.clone());
opened
});
return Ok(Some(Index(raw_index)))
}
Ok(None)
}
pub fn create_index(&self, name: String, schema: Schema) -> Result<Index, Error> {
match self.open_index(&name)? {
Some(index) => {
if index.schema() != &schema {
return Err(Error::SchemaDiffer);
}
Ok(index)
},
None => {
let raw_name = index_name(&name);
let tree = self.inner.open_tree(raw_name)?;
let raw_index = RawIndex::new_from_raw(tree, schema)?;
self.opened.rcu(|opened| {
let mut opened = HashMap::clone(opened);
opened.insert(name.clone(), raw_index.clone());
opened
});
Ok(Index(raw_index))
},
}
}
}
#[derive(Clone)]
pub struct RawIndex {
schema: Schema,
word_index: Arc<ArcSwap<WordIndex>>,
ranked_map: Arc<ArcSwap<RankedMap>>,
inner: Arc<sled::Tree>,
}
impl RawIndex {
fn from_raw(inner: Arc<sled::Tree>) -> Result<RawIndex, Error> {
let schema = {
let bytes = inner.get("schema")?;
let bytes = bytes.ok_or(Error::SchemaMissing)?;
Schema::read_from_bin(bytes.as_ref())?
};
let bytes = inner.get("word-index")?;
let bytes = bytes.ok_or(Error::WordIndexMissing)?;
let word_index = {
let len = bytes.len();
let bytes: Arc<[u8]> = Into::into(bytes);
let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len);
// TODO must handle this error
let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap();
Arc::new(ArcSwap::new(Arc::new(word_index)))
};
let ranked_map = {
let map = match inner.get("ranked-map")? {
Some(bytes) => bincode::deserialize(bytes.as_ref())?,
None => RankedMap::default(),
};
Arc::new(ArcSwap::new(Arc::new(map)))
};
Ok(RawIndex { schema, word_index, ranked_map, inner })
}
fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<RawIndex, Error> {
let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
inner.set("schema", schema_bytes)?;
let word_index = WordIndex::default();
inner.set("word-index", word_index.into_bytes())?;
let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));
let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
Ok(RawIndex { schema, word_index, ranked_map, inner })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
self.word_index.lease()
}
pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
self.ranked_map.lease()
}
pub fn update_word_index(&self, word_index: Arc<WordIndex>) -> sled::Result<()> {
let data = word_index.into_bytes();
self.inner.set("word-index", data).map(drop)?;
self.word_index.store(word_index);
Ok(())
}
pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) -> sled::Result<()> {
let data = bincode::serialize(ranked_map.as_ref()).unwrap();
self.inner.set("ranked-map", data).map(drop)?;
self.ranked_map.store(ranked_map);
Ok(())
}
pub fn set_document_attribute<V>(
&self,
id: DocumentId,
attr: SchemaAttr,
value: V,
) -> Result<Option<IVec>, sled::Error>
where IVec: From<V>,
{
let key = document_key(id, attr);
Ok(self.inner.set(key, value)?)
}
pub fn get_document_attribute(
&self,
id: DocumentId,
attr: SchemaAttr
) -> Result<Option<IVec>, sled::Error>
{
let key = document_key(id, attr);
Ok(self.inner.get(key)?)
}
pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
let start = document_key(id, SchemaAttr::min());
let end = document_key(id, SchemaAttr::max());
DocumentFieldsIter(self.inner.range(start..=end))
}
pub fn del_document_attribute(
&self,
id: DocumentId,
attr: SchemaAttr
) -> Result<Option<IVec>, sled::Error>
{
let key = document_key(id, attr);
Ok(self.inner.del(key)?)
}
}
pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
impl<'a> Iterator for DocumentFieldsIter<'a> {
type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
fn next(&mut self) -> Option<Self::Item> {
match self.0.next() {
Some(Ok((key, value))) => {
let (id, attr) = extract_document_key(key).unwrap();
Some(Ok((id, attr, value)))
},
Some(Err(e)) => Some(Err(Error::SledError(e))),
None => None,
}
}
}
#[derive(Clone)]
pub struct Index(RawIndex);
impl Index {
pub fn query_builder(&self) -> QueryBuilder<Lease<Arc<WordIndex>>> {
let word_index = self.word_index();
QueryBuilder::new(word_index)
}
pub fn query_builder_with_criteria<'c>(
&self,
criteria: Criteria<'c>,
) -> QueryBuilder<'c, Lease<Arc<WordIndex>>>
{
let word_index = self.word_index();
QueryBuilder::with_criteria(word_index, criteria)
}
pub fn schema(&self) -> &Schema {
self.0.schema()
}
pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
self.0.word_index()
}
pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
self.0.ranked_map()
}
pub fn documents_addition(&self) -> DocumentsAddition {
let index = self.0.clone();
let ranked_map = self.0.ranked_map().clone();
DocumentsAddition::from_raw(index, ranked_map)
}
pub fn documents_deletion(&self) -> DocumentsDeletion {
let index = self.0.clone();
DocumentsDeletion::from_raw(index)
}
pub fn document<T>(
&self,
fields: Option<&HashSet<&str>>,
id: DocumentId,
) -> Result<Option<T>, RmpError>
where T: de::DeserializeOwned,
{
let fields = match fields {
Some(fields) => {
let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n));
Some(HashSet::from_iter(iter))
},
None => None,
};
let mut deserializer = Deserializer {
document_id: id,
raw_index: &self.0,
fields: fields.as_ref(),
};
// TODO: currently we return an error if all document fields are missing,
// returning None would have been better
T::deserialize(&mut deserializer).map(Some)
}
}
pub struct DocumentsAddition {
inner: RawIndex,
indexer: Indexer,
ranked_map: RankedMap,
}
impl DocumentsAddition {
pub fn from_raw(inner: RawIndex, ranked_map: RankedMap) -> DocumentsAddition {
DocumentsAddition { inner, indexer: Indexer::new(), ranked_map }
}
pub fn update_document<D>(&mut self, document: D) -> Result<(), Error>
where D: serde::Serialize,
{
let schema = self.inner.schema();
let identifier = schema.identifier_name();
let document_id = match extract_document_id(identifier, &document)? {
Some(id) => id,
None => return Err(Error::MissingDocumentId),
};
let serializer = Serializer {
schema,
index: &self.inner,
indexer: &mut self.indexer,
ranked_map: &mut self.ranked_map,
document_id,
};
document.serialize(serializer)?;
Ok(())
}
pub fn finalize(self) -> sled::Result<()> {
let delta_index = self.indexer.build();
let index = self.inner.word_index();
let new_index = index.r#union(&delta_index);
let new_index = Arc::from(new_index);
self.inner.update_word_index(new_index)?;
Ok(())
}
}
pub struct DocumentsDeletion {
inner: RawIndex,
documents: Vec<DocumentId>,
}
impl DocumentsDeletion {
pub fn from_raw(inner: RawIndex) -> DocumentsDeletion {
DocumentsDeletion {
inner,
documents: Vec::new(),
}
}
pub fn delete_document(&mut self, id: DocumentId) {
self.documents.push(id);
}
pub fn finalize(mut self) -> Result<(), Error> {
self.documents.sort_unstable();
self.documents.dedup();
let idset = SetBuf::new_unchecked(self.documents);
let index = self.inner.word_index();
let new_index = index.remove_documents(&idset);
let new_index = Arc::from(new_index);
self.inner.update_word_index(new_index)?;
Ok(())
}
}

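A note on the key scheme above: each document attribute lives under a composite sled key made of the literal prefix `document-`, the document id as 8 big-endian bytes, and the attribute number as 2 big-endian bytes. Big-endian keeps lexicographic key order consistent with numeric order, which is what lets get_document_fields scan one document's fields with a plain start..=end range. A sketch of the round trip, mirroring document_key and extract_document_key (uses the byteorder crate, as the file does):

use std::io::Cursor;
use byteorder::{BigEndian, ReadBytesExt};

fn document_key(document_id: u64, schema_attr: u16) -> Vec<u8> {
    let mut bytes = Vec::new();
    bytes.extend_from_slice(b"document-");
    bytes.extend_from_slice(&document_id.to_be_bytes());
    bytes.extend_from_slice(&schema_attr.to_be_bytes());
    bytes
}

fn main() -> std::io::Result<()> {
    let key = document_key(42, 7);
    assert_eq!(key.len(), 9 + 8 + 2);

    // Skip the 9-byte prefix, then read the id and attribute back.
    let mut cursor = Cursor::new(&key[9..]);
    assert_eq!(cursor.read_u64::<BigEndian>()?, 42);
    assert_eq!(cursor.read_u16::<BigEndian>()?, 7);
    Ok(())
}
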
meilidb-data/src/index_event.rs (new file)

@@ -0,0 +1,45 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use meilidb_core::{Index as WordIndex};
use meilidb_core::data::DocIds;
use meilidb_core::write_to_bytes::WriteToBytes;
use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
enum NewIndexEvent<'a> {
RemovedDocuments(&'a DocIds),
UpdatedDocuments(&'a WordIndex),
}
impl<'a> WriteToBytes for NewIndexEvent<'a> {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
match self {
NewIndexEvent::RemovedDocuments(doc_ids) => {
let _ = bytes.write_u8(0);
doc_ids.write_to_bytes(bytes);
},
NewIndexEvent::UpdatedDocuments(index) => {
let _ = bytes.write_u8(1);
index.write_to_bytes(bytes);
}
}
}
}
enum IndexEvent {
RemovedDocuments(DocIds),
UpdatedDocuments(WordIndex),
}
impl FromSharedDataCursor for IndexEvent {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
match cursor.read_u8()? {
0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments),
1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
_ => Err("invalid index event type".into()),
}
}
}

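The event framing above is a one-byte tag followed by the payload: 0 for RemovedDocuments, 1 for UpdatedDocuments, anything else rejected on read. A reduced sketch of the same framing, with plain byte payloads standing in for the real DocIds/Index types:

#[derive(Debug)]
enum Event {
    Removed(Vec<u8>),
    Updated(Vec<u8>),
}

fn write_event(event: &Event, bytes: &mut Vec<u8>) {
    match event {
        // The tag byte always comes first, then the raw payload.
        Event::Removed(payload) => { bytes.push(0); bytes.extend_from_slice(payload) }
        Event::Updated(payload) => { bytes.push(1); bytes.extend_from_slice(payload) }
    }
}

fn read_event(bytes: &[u8]) -> Result<Event, &'static str> {
    match bytes.split_first() {
        Some((&0, rest)) => Ok(Event::Removed(rest.to_vec())),
        Some((&1, rest)) => Ok(Event::Updated(rest.to_vec())),
        Some(_) => Err("invalid index event type"),
        None => Err("empty input"),
    }
}

fn main() {
    let mut buf = Vec::new();
    write_event(&Event::Updated(vec![0xAA]), &mut buf);
    assert!(matches!(read_event(&buf), Ok(Event::Updated(_))));
}
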
meilidb-data/src/indexer.rs (new file, 117 lines)

@@ -0,0 +1,117 @@
use std::collections::BTreeMap;
use std::convert::TryFrom;
use deunicode::deunicode_with_tofu;
use meilidb_core::{DocumentId, DocIndex};
use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
use sdset::Set;
use crate::SchemaAttr;
type Word = Vec<u8>; // TODO make it be a SmallVec
pub struct Indexer {
word_limit: usize, // the maximum number of indexed words
indexed: BTreeMap<Word, Vec<DocIndex>>,
}
impl Indexer {
pub fn new() -> Indexer {
Indexer {
word_limit: 1000,
indexed: BTreeMap::new(),
}
}
pub fn with_word_limit(limit: usize) -> Indexer {
Indexer {
word_limit: limit,
indexed: BTreeMap::new(),
}
}
pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
for token in Tokenizer::new(text) {
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
if !must_continue { break }
}
}
pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
where I: IntoIterator<Item=&'a str>,
{
let iter = iter.into_iter();
for token in SeqTokenizer::new(iter) {
let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
if !must_continue { break }
}
}
pub fn build(self) -> WordIndex {
let mut builder = WordIndexBuilder::new();
for (key, mut indexes) in self.indexed {
indexes.sort_unstable();
indexes.dedup();
let indexes = Set::new_unchecked(&indexes);
builder.insert(key, indexes).unwrap();
}
builder.build()
}
}
fn index_token(
token: Token,
id: DocumentId,
attr: SchemaAttr,
word_limit: usize,
indexed: &mut BTreeMap<Word, Vec<DocIndex>>,
) -> bool
{
if token.word_index >= word_limit { return false }
let lower = token.word.to_lowercase();
let token = Token { word: &lower, ..token };
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
},
None => return false,
}
if !lower.contains(is_cjk) {
let unidecoded = deunicode_with_tofu(&lower, "");
if unidecoded != lower {
let token = Token { word: &unidecoded, ..token };
match token_to_docindex(id, attr, token) {
Some(docindex) => {
let word = Vec::from(token.word);
indexed.entry(word).or_insert_with(Vec::new).push(docindex);
},
None => return false,
}
}
}
true
}
fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
let word_index = u16::try_from(token.word_index).ok()?;
let char_index = u16::try_from(token.char_index).ok()?;
let char_length = u16::try_from(token.word.chars().count()).ok()?;
let docindex = DocIndex {
document_id: id,
attribute: attr.0,
word_index: word_index,
char_index: char_index,
char_length: char_length,
};
Some(docindex)
}

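Worth noting in index_token above: every non-CJK word is indexed in its lowercased form, and when the deunicode transliteration differs, that ASCII form is indexed at the same position too, so a query for `cafe` can also match a document containing `Café`. A small sketch of the transliteration step (deunicode crate, as in the diff):

use deunicode::deunicode_with_tofu;

fn main() {
    let lower = "Café".to_lowercase();

    // The second argument replaces characters that cannot be
    // transliterated; the indexer passes "" so they just vanish.
    let unidecoded = deunicode_with_tofu(&lower, "");
    assert_eq!(unidecoded, "cafe");

    // The extra index entry is only added when the two forms differ.
    assert_ne!(unidecoded, lower);
}
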
meilidb-data/src/lib.rs (new file, 12 lines)

@@ -0,0 +1,12 @@
mod database;
mod index_event;
mod indexer;
mod number;
mod ranked_map;
mod serde;
pub mod schema;
pub use self::database::{Database, Index};
pub use self::number::Number;
pub use self::ranked_map::RankedMap;
pub use self::schema::{Schema, SchemaAttr};

meilidb-data/src/number.rs (new file)

@@ -0,0 +1,55 @@
use std::num::{ParseIntError, ParseFloatError};
use std::str::FromStr;
use std::fmt;
use ordered_float::OrderedFloat;
use serde::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(OrderedFloat<f64>),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
let uint_error = match u64::from_str(s) {
Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
Err(error) => error,
};
let int_error = match i64::from_str(s) {
Ok(signed) => return Ok(Number::Signed(signed)),
Err(error) => error,
};
let float_error = match f64::from_str(s) {
Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
Err(error) => error,
};
Err(ParseNumberError { uint_error, int_error, float_error })
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseNumberError {
uint_error: ParseIntError,
int_error: ParseIntError,
float_error: ParseFloatError,
}
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
if self.uint_error == self.int_error {
write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
} else {
write!(f, "can not parse number: {}, {}, {}",
self.uint_error, self.int_error, self.float_error)
}
}
}

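Number::from_str above tries the three parsers in order (u64, then i64, then f64) and fails only when all three do, so the variant you get is decided by the first grammar that accepts the string. A simplified standalone mirror of that fallback (the real type keeps all three errors and wraps floats in OrderedFloat so Number stays Ord and Hash):

use std::str::FromStr;

#[derive(Debug, PartialEq)]
enum Num {
    Unsigned(u64),
    Signed(i64),
    Float(f64),
}

fn parse(s: &str) -> Option<Num> {
    if let Ok(unsigned) = u64::from_str(s) { return Some(Num::Unsigned(unsigned)) }
    if let Ok(signed) = i64::from_str(s) { return Some(Num::Signed(signed)) }
    f64::from_str(s).ok().map(Num::Float)
}

fn main() {
    assert_eq!(parse("42"), Some(Num::Unsigned(42)));  // u64 accepts it first
    assert_eq!(parse("-42"), Some(Num::Signed(-42)));  // the sign rejects u64
    assert_eq!(parse("40.2"), Some(Num::Float(40.2))); // only f64 parses this
    assert_eq!(parse("forty"), None);                  // all three parsers fail
}
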
meilidb-data/src/ranked_map.rs (new file)

@@ -0,0 +1,5 @@
use hashbrown::HashMap;
use meilidb_core::DocumentId;
use crate::{SchemaAttr, Number};
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;

meilidb-data/src/schema.rs

@@ -5,13 +5,9 @@ use std::{fmt, u16};
 use std::ops::BitOr;
 use std::sync::Arc;
 
-use serde_derive::{Serialize, Deserialize};
+use serde::{Serialize, Deserialize};
 use linked_hash_map::LinkedHashMap;
 
-use crate::database::serde::find_id::FindDocumentIdSerializer;
-use crate::database::serde::SerializerError;
-use crate::DocumentId;
-
 pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
 pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
 pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };
@@ -166,14 +162,6 @@ impl Schema {
         attributes
     }
 
-    pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
-    where T: serde::Serialize,
-    {
-        let id_attribute_name = &self.inner.identifier;
-        let serializer = FindDocumentIdSerializer { id_attribute_name };
-        document.serialize(serializer)
-    }
-
     pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
         let (_, props) = self.inner.props[attr.0 as usize];
         props

meilidb-data/src/serde/convert_to_number.rs

@@ -1,12 +1,16 @@
-use serde::Serialize;
+use std::str::FromStr;
+
+use ordered_float::OrderedFloat;
 use serde::ser;
+use serde::Serialize;
 
-use crate::database::serde::SerializerError;
+use super::SerializerError;
+use crate::Number;
 
-pub struct KeyToStringSerializer;
+pub struct ConvertToNumber;
 
-impl ser::Serializer for KeyToStringSerializer {
-    type Ok = String;
+impl ser::Serializer for ConvertToNumber {
+    type Ok = Number;
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@@ -16,48 +20,78 @@ impl ser::Serializer for KeyToStringSerializer {
     type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
-    forward_to_unserializable_type! {
-        bool => serialize_bool,
-        char => serialize_char,
-        i8 => serialize_i8,
-        i16 => serialize_i16,
-        i32 => serialize_i32,
-        i64 => serialize_i64,
-        u8 => serialize_u8,
-        u16 => serialize_u16,
-        u32 => serialize_u32,
-        u64 => serialize_u64,
-        f32 => serialize_f32,
-        f64 => serialize_f64,
+    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnrankableType { type_name: "char" })
+    }
+
+    fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
+
+    fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
+
+    fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
+
+    fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value))
+    }
+
+    fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value))
+    }
+
+    fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(OrderedFloat(value as f64)))
+    }
+
+    fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(OrderedFloat(value)))
     }
 
     fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
-        Ok(value.to_string())
+        Ok(Number::from_str(value)?)
     }
 
     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "&[u8]" })
+        Err(SerializerError::UnrankableType { type_name: "&[u8]" })
     }
 
     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnrankableType { type_name: "Option" })
     }
 
     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnrankableType { type_name: "Option" })
     }
 
     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "()" })
+        Err(SerializerError::UnrankableType { type_name: "()" })
    }
 
     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "unit struct" })
+        Err(SerializerError::UnrankableType { type_name: "unit struct" })
     }
 
     fn serialize_unit_variant(
@@ -67,7 +101,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _variant: &'static str
     ) -> Result<Self::Ok, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "unit variant" })
+        Err(SerializerError::UnrankableType { type_name: "unit variant" })
     }
 
     fn serialize_newtype_struct<T: ?Sized>(
@@ -89,15 +123,15 @@ impl ser::Serializer for KeyToStringSerializer {
     ) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "newtype variant" })
+        Err(SerializerError::UnrankableType { type_name: "newtype variant" })
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "sequence" })
+        Err(SerializerError::UnrankableType { type_name: "sequence" })
     }
 
     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "tuple" })
+        Err(SerializerError::UnrankableType { type_name: "tuple" })
    }
 
     fn serialize_tuple_struct(
@@ -106,7 +140,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "tuple struct" })
+        Err(SerializerError::UnrankableType { type_name: "tuple struct" })
     }
 
     fn serialize_tuple_variant(
@@ -117,11 +151,11 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleVariant, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "tuple variant" })
+        Err(SerializerError::UnrankableType { type_name: "tuple variant" })
     }
 
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "map" })
+        Err(SerializerError::UnrankableType { type_name: "map" })
     }
 
     fn serialize_struct(
@@ -130,7 +164,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct" })
+        Err(SerializerError::UnrankableType { type_name: "struct" })
     }
 
     fn serialize_struct_variant(
@@ -141,6 +175,6 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeStructVariant, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct variant" })
+        Err(SerializerError::UnrankableType { type_name: "struct variant" })
     }
 }

meilidb-data/src/serde/convert_to_string.rs

@@ -1,15 +1,12 @@
-use std::str::FromStr;
-
 use serde::Serialize;
-use serde::{ser, ser::Error};
+use serde::ser;
 
-use crate::database::serde::SerializerError;
-use crate::database::Number;
+use super::SerializerError;
 
-pub struct ValueToNumberSerializer;
+pub struct ConvertToString;
 
-impl ser::Serializer for ValueToNumberSerializer {
-    type Ok = Number;
+impl ser::Serializer for ConvertToString {
+    type Ok = String;
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@@ -19,75 +16,78 @@ impl ser::Serializer for ValueToNumberSerializer {
     type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
-    forward_to_unserializable_type! {
-        bool => serialize_bool,
-        char => serialize_char,
+    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "boolean" })
+    }
+
+    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+        Ok(value.to_string())
     }
 
     fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
    }
 
     fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Float(value as f64))
+        Ok(value.to_string())
     }
 
     fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Float(value))
+        Ok(value.to_string())
     }
 
     fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
-        Number::from_str(value).map_err(SerializerError::custom)
+        Ok(value.to_string())
     }
 
     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "&[u8]" })
+        Err(SerializerError::UnserializableType { type_name: "&[u8]" })
     }
 
     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnserializableType { type_name: "Option" })
     }
 
     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnserializableType { type_name: "Option" })
     }
 
     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "()" })
+        Err(SerializerError::UnserializableType { type_name: "()" })
     }
 
     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "unit struct" })
+        Err(SerializerError::UnserializableType { type_name: "unit struct" })
     }
 
     fn serialize_unit_variant(
@@ -97,7 +97,7 @@ impl ser::Serializer for ValueToNumberSerializer {
         _variant: &'static str
     ) -> Result<Self::Ok, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "unit variant" })
+        Err(SerializerError::UnserializableType { type_name: "unit variant" })
     }
 
     fn serialize_newtype_struct<T: ?Sized>(
@@ -119,15 +119,15 @@ impl ser::Serializer for ValueToNumberSerializer {
     ) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "newtype variant" })
+        Err(SerializerError::UnserializableType { type_name: "newtype variant" })
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "sequence" })
+        Err(SerializerError::UnserializableType { type_name: "sequence" })
     }
 
     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "tuple" })
+        Err(SerializerError::UnserializableType { type_name: "tuple" })
     }
 
     fn serialize_tuple_struct(
@@ -136,7 +136,7 @@ impl ser::Serializer for ValueToNumberSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "tuple struct" })
+        Err(SerializerError::UnserializableType { type_name: "tuple struct" })
    }
 
     fn serialize_tuple_variant(
@@ -147,11 +147,11 @@ impl ser::Serializer for ValueToNumberSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleVariant, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "tuple variant" })
+        Err(SerializerError::UnserializableType { type_name: "tuple variant" })
     }
 
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "map" })
+        Err(SerializerError::UnserializableType { type_name: "map" })
     }
 
     fn serialize_struct(
@@ -160,7 +160,7 @@ impl ser::Serializer for ValueToNumberSerializer {
         _len: usize
     ) -> Result<Self::SerializeStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct" })
+        Err(SerializerError::UnserializableType { type_name: "struct" })
     }
 
     fn serialize_struct_variant(
@@ -171,6 +171,6 @@ impl ser::Serializer for ValueToNumberSerializer {
         _len: usize
     ) -> Result<Self::SerializeStructVariant, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct variant" })
+        Err(SerializerError::UnserializableType { type_name: "struct variant" })
     }
 }

meilidb-data/src/serde/deserializer.rs (new file)

@@ -0,0 +1,97 @@
use std::collections::HashSet;
use std::io::Cursor;
use meilidb_core::DocumentId;
use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
use rmp_serde::decode::{Error as RmpError};
use serde::{de, forward_to_deserialize_any};
use crate::database::RawIndex;
use crate::SchemaAttr;
pub struct Deserializer<'a> {
pub document_id: DocumentId,
pub raw_index: &'a RawIndex,
pub fields: Option<&'a HashSet<SchemaAttr>>,
}
impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
{
type Error = RmpError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
bytes byte_buf unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
let document_attributes = self.raw_index.get_document_fields(self.document_id);
let document_attributes = document_attributes.filter_map(|result| {
match result {
Ok(value) => Some(value),
Err(e) => {
// TODO: must log the error
// error!("sled iter error; {}", e);
None
},
}
});
let iter = document_attributes.filter_map(|(_, attr, value)| {
if self.fields.map_or(true, |f| f.contains(&attr)) {
let attribute_name = self.raw_index.schema().attribute_name(attr);
Some((attribute_name, Value::new(value)))
} else {
None
}
});
let map_deserializer = de::value::MapDeserializer::new(iter);
visitor.visit_map(map_deserializer)
}
}
struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
impl<A> Value<A> where A: AsRef<[u8]>
{
fn new(value: A) -> Value<A> {
Value(RmpDeserializer::new(Cursor::new(value)))
}
}
impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
where A: AsRef<[u8]>,
{
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
where A: AsRef<[u8]>,
{
type Error = RmpError;
fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
self.0.deserialize_any(visitor)
}
forward_to_deserialize_any! {
bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
bytes byte_buf option unit unit_struct newtype_struct seq tuple
tuple_struct map struct enum identifier ignored_any
}
}

meilidb-data/src/serde/extract_document_id.rs

@ -1,23 +1,41 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use meilidb_core::DocumentId;
use serde::Serialize; use serde::Serialize;
use serde::ser; use serde::ser;
use crate::database::serde::key_to_string::KeyToStringSerializer; use super::{SerializerError, ConvertToString};
use crate::database::serde::{SerializerError, calculate_hash};
use crate::DocumentId;
pub struct FindDocumentIdSerializer<'a> { pub fn extract_document_id<D>(
pub id_attribute_name: &'a str, identifier: &str,
document: &D,
) -> Result<Option<DocumentId>, SerializerError>
where D: serde::Serialize,
{
let serializer = ExtractDocumentId { identifier };
document.serialize(serializer)
} }
impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { fn calculate_hash<T: Hash>(t: &T) -> u64 {
type Ok = DocumentId; let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
struct ExtractDocumentId<'a> {
identifier: &'a str,
}
impl<'a> ser::Serializer for ExtractDocumentId<'a> {
type Ok = Option<DocumentId>;
type Error = SerializerError; type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>; type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>; type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = FindDocumentIdMapSerializer<'a>; type SerializeMap = ExtractDocumentIdMapSerializer<'a>;
type SerializeStruct = FindDocumentIdStructSerializer<'a>; type SerializeStruct = ExtractDocumentIdStructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>; type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! { forward_to_unserializable_type! {
@ -38,30 +56,30 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
f64 => serialize_f64, f64 => serialize_f64,
} }
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> { fn serialize_str(self, _value: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" }) Err(SerializerError::UnserializableType { type_name: "str" })
} }
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> { fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" }) Err(SerializerError::UnserializableType { type_name: "&[u8]" })
} }
fn serialize_none(self) -> Result<Self::Ok, Self::Error> { fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" }) Err(SerializerError::UnserializableType { type_name: "Option" })
} }
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error> fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize, where T: Serialize,
{ {
Err(SerializerError::UnserializableType { name: "Option" }) Err(SerializerError::UnserializableType { type_name: "Option" })
} }
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> { fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" }) Err(SerializerError::UnserializableType { type_name: "()" })
} }
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> { fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" }) Err(SerializerError::UnserializableType { type_name: "unit struct" })
} }
fn serialize_unit_variant( fn serialize_unit_variant(
@ -71,7 +89,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_variant: &'static str _variant: &'static str
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
{ {
Err(SerializerError::UnserializableType { name: "unit variant" }) Err(SerializerError::UnserializableType { type_name: "unit variant" })
} }
fn serialize_newtype_struct<T: ?Sized>( fn serialize_newtype_struct<T: ?Sized>(
@ -93,15 +111,15 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
) -> Result<Self::Ok, Self::Error> ) -> Result<Self::Ok, Self::Error>
where T: Serialize, where T: Serialize,
{ {
Err(SerializerError::UnserializableType { name: "newtype variant" }) Err(SerializerError::UnserializableType { type_name: "newtype variant" })
} }
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> { fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" }) Err(SerializerError::UnserializableType { type_name: "sequence" })
} }
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> { fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" }) Err(SerializerError::UnserializableType { type_name: "tuple" })
} }
fn serialize_tuple_struct( fn serialize_tuple_struct(
@ -110,7 +128,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize _len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error> ) -> Result<Self::SerializeTupleStruct, Self::Error>
{ {
Err(SerializerError::UnserializableType { name: "tuple struct" }) Err(SerializerError::UnserializableType { type_name: "tuple struct" })
} }
fn serialize_tuple_variant( fn serialize_tuple_variant(
@ -121,15 +139,17 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize _len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error> ) -> Result<Self::SerializeTupleVariant, Self::Error>
{ {
Err(SerializerError::UnserializableType { name: "tuple variant" }) Err(SerializerError::UnserializableType { type_name: "tuple variant" })
} }
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> { fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(FindDocumentIdMapSerializer { let serializer = ExtractDocumentIdMapSerializer {
id_attribute_name: self.id_attribute_name, identifier: self.identifier,
document_id: None, document_id: None,
current_key_name: None, current_key_name: None,
}) };
Ok(serializer)
} }
fn serialize_struct( fn serialize_struct(
@ -138,10 +158,12 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize _len: usize
) -> Result<Self::SerializeStruct, Self::Error> ) -> Result<Self::SerializeStruct, Self::Error>
{ {
Ok(FindDocumentIdStructSerializer { let serializer = ExtractDocumentIdStructSerializer {
id_attribute_name: self.id_attribute_name, identifier: self.identifier,
document_id: None, document_id: None,
}) };
Ok(serializer)
} }
fn serialize_struct_variant( fn serialize_struct_variant(
@ -152,24 +174,24 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> {
_len: usize _len: usize
) -> Result<Self::SerializeStructVariant, Self::Error> ) -> Result<Self::SerializeStructVariant, Self::Error>
{ {
Err(SerializerError::UnserializableType { name: "struct variant" }) Err(SerializerError::UnserializableType { type_name: "struct variant" })
} }
} }
pub struct FindDocumentIdMapSerializer<'a> { pub struct ExtractDocumentIdMapSerializer<'a> {
id_attribute_name: &'a str, identifier: &'a str,
document_id: Option<DocumentId>, document_id: Option<DocumentId>,
current_key_name: Option<String>, current_key_name: Option<String>,
} }
impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> {
type Ok = DocumentId; type Ok = Option<DocumentId>;
type Error = SerializerError; type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error> fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize, where T: Serialize,
{ {
let key = key.serialize(KeyToStringSerializer)?; let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key); self.current_key_name = Some(key);
Ok(()) Ok(())
} }
@ -188,9 +210,9 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where K: Serialize, V: Serialize, where K: Serialize, V: Serialize,
{ {
let key = key.serialize(KeyToStringSerializer)?; let key = key.serialize(ConvertToString)?;
if self.id_attribute_name == key { if self.identifier == key {
// TODO is it possible to have multiple ids? // TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap(); let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id); let hash = calculate_hash(&id);
@ -201,20 +223,17 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> {
} }
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id { Ok(self.document_id)
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
} }
} }
pub struct FindDocumentIdStructSerializer<'a> { pub struct ExtractDocumentIdStructSerializer<'a> {
id_attribute_name: &'a str, identifier: &'a str,
document_id: Option<DocumentId>, document_id: Option<DocumentId>,
} }
impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> {
type Ok = DocumentId; type Ok = Option<DocumentId>;
type Error = SerializerError; type Error = SerializerError;
fn serialize_field<T: ?Sized>( fn serialize_field<T: ?Sized>(
@ -224,7 +243,7 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
) -> Result<(), Self::Error> ) -> Result<(), Self::Error>
where T: Serialize, where T: Serialize,
{ {
if self.id_attribute_name == key { if self.identifier == key {
// TODO is it possible to have multiple ids? // TODO is it possible to have multiple ids?
let id = bincode::serialize(value).unwrap(); let id = bincode::serialize(value).unwrap();
let hash = calculate_hash(&id); let hash = calculate_hash(&id);
@ -235,9 +254,6 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
} }
fn end(self) -> Result<Self::Ok, Self::Error> { fn end(self) -> Result<Self::Ok, Self::Error> {
match self.document_id { Ok(self.document_id)
Some(document_id) => Ok(document_id),
None => Err(SerializerError::DocumentIdNotFound)
}
} }
} }
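
The identifier value is bincode-serialized and its bytes hashed with the standard DefaultHasher; the resulting u64 becomes the DocumentId (a u64 newtype, as the document key code further below shows). A self-contained sketch of that derivation, with an illustrative helper name:

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Mirrors the scheme above: serialize the identifier value with bincode,
// then hash the bytes with the std DefaultHasher.
fn id_from_identifier<T: serde::Serialize>(value: &T) -> u64 {
    let bytes = bincode::serialize(value).unwrap();
    let mut hasher = DefaultHasher::new();
    bytes.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    // Equal identifiers always map to the same id, distinct ones
    // (almost surely) to different ids.
    assert_eq!(id_from_identifier(&"movie-42"), id_from_identifier(&"movie-42"));
    assert_ne!(id_from_identifier(&"movie-42"), id_from_identifier(&"movie-43"));
}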

View File

@ -0,0 +1,337 @@
use meilidb_core::DocumentId;
use serde::ser;
use serde::Serialize;
use crate::database::RawIndex;
use crate::indexer::Indexer as RawIndexer;
use crate::schema::SchemaAttr;
use super::{SerializerError, ConvertToString};
pub struct Indexer<'a> {
pub attribute: SchemaAttr,
pub indexer: &'a mut RawIndexer,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Indexer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = SeqIndexer<'a>;
type SerializeTuple = TupleIndexer<'a>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapIndexer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
fn serialize_bool(self, _value: bool) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "boolean" })
}
fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
let text = value.serialize(ConvertToString)?;
self.serialize_str(&text)
}
fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
self.indexer.index_text(self.document_id, self.attribute, text);
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.indexer.index_text(self.document_id, self.attribute, &text);
Ok(())
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnindexableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnindexableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
let indexer = SeqIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
let indexer = TupleIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
let indexer = MapIndexer {
attribute: self.attribute,
document_id: self.document_id,
indexer: self.indexer,
texts: Vec::new(),
};
Ok(indexer)
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnindexableType { type_name: "struct variant" })
}
}
pub struct SeqIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct MapIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeMap for MapIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = key.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct StructSerializer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key_text = key.to_owned();
let value_text = value.serialize(ConvertToString)?;
self.texts.push(key_text);
self.texts.push(value_text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
pub struct TupleIndexer<'a> {
attribute: SchemaAttr,
document_id: DocumentId,
indexer: &'a mut RawIndexer,
texts: Vec<String>,
}
impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize
{
let text = value.serialize(ConvertToString)?;
self.texts.push(text);
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
let texts = self.texts.iter().map(String::as_str);
self.indexer.index_text_seq(self.document_id, self.attribute, texts);
Ok(())
}
}
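
A hedged fragment showing how a compound value reaches the index through this serializer; `indexer`, `attr`, and `id` are assumed to come from an in-progress document addition.

use serde::Serialize;

// `attr` is the SchemaAttr of a "tags" attribute, `id` the DocumentId.
let tags = vec!["science".to_string(), "fiction".to_string()];
let serializer = Indexer { attribute: attr, indexer: &mut indexer, document_id: id };

// A Vec<String> goes through SeqIndexer: each element is converted to its
// string form, buffered, then flushed as a single index_text_seq call.
tags.serialize(serializer)?;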

View File

@ -0,0 +1,97 @@
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: stringify!($ty) })
}
)*
}
}
mod convert_to_number;
mod convert_to_string;
mod deserializer;
mod extract_document_id;
mod indexer;
mod serializer;
pub use self::deserializer::Deserializer;
pub use self::extract_document_id::extract_document_id;
pub use self::convert_to_string::ConvertToString;
pub use self::convert_to_number::ConvertToNumber;
pub use self::indexer::Indexer;
pub use self::serializer::Serializer;
use std::{fmt, error::Error};
use rmp_serde::encode::Error as RmpError;
use serde::ser;
use crate::number::ParseNumberError;
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
RmpError(RmpError),
SledError(sled::Error),
ParseNumberError(ParseNumberError),
UnserializableType { type_name: &'static str },
UnindexableType { type_name: &'static str },
UnrankableType { type_name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
SerializerError::SledError(e) => write!(f, "sled related error: {}", e),
SerializerError::ParseNumberError(e) => {
write!(f, "error while trying to parse a number: {}", e)
},
SerializerError::UnserializableType { type_name } => {
write!(f, "{} is not a serializable type", type_name)
},
SerializerError::UnindexableType { type_name } => {
write!(f, "{} is not an indexable type", type_name)
},
SerializerError::UnrankableType { type_name } => {
write!(f, "{} types cannot be used for ranking", type_name)
},
SerializerError::Custom(s) => f.write_str(s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}
impl From<RmpError> for SerializerError {
fn from(error: RmpError) -> SerializerError {
SerializerError::RmpError(error)
}
}
impl From<sled::Error> for SerializerError {
fn from(error: sled::Error) -> SerializerError {
SerializerError::SledError(error)
}
}
impl From<ParseNumberError> for SerializerError {
fn from(error: ParseNumberError) -> SerializerError {
SerializerError::ParseNumberError(error)
}
}
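
For reference, the `bool => serialize_bool` arm of the forward_to_unserializable_type! macro at the top of this module expands to roughly the following; a plain "$ty" string literal would not be substituted during expansion, which is why the macro uses stringify!.

fn serialize_bool(self, _v: bool) -> Result<Self::Ok, Self::Error> {
    // stringify!(bool) yields the literal "bool"
    Err(SerializerError::UnserializableType { type_name: stringify!(bool) })
}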

View File

@ -0,0 +1,295 @@
use meilidb_core::DocumentId;
use serde::ser;
use crate::database::RawIndex;
use crate::ranked_map::RankedMap;
use crate::indexer::Indexer as RawIndexer;
use crate::schema::{Schema, SchemaAttr};
use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
pub struct Serializer<'a> {
pub schema: &'a Schema,
pub index: &'a RawIndex,
pub indexer: &'a mut RawIndexer,
pub ranked_map: &'a mut RankedMap,
pub document_id: DocumentId,
}
impl<'a> ser::Serializer for Serializer<'a> {
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a>;
type SerializeStruct = StructSerializer<'a>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: ser::Serialize,
{
Err(SerializerError::UnserializableType { type_name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { type_name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
index: self.index,
indexer: self.indexer,
ranked_map: self.ranked_map,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
index: self.index,
indexer: self.indexer,
ranked_map: self.ranked_map,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { type_name: "struct variant" })
}
}
pub struct MapSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
index: &'a RawIndex,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
current_key_name: Option<String>,
}
impl<'a> ser::SerializeMap for MapSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: ser::Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: ser::Serialize, V: ser::Serialize,
{
let key = key.serialize(ConvertToString)?;
serialize_value(
self.schema,
self.document_id,
self.index,
self.indexer,
self.ranked_map,
&key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a> {
schema: &'a Schema,
document_id: DocumentId,
index: &'a RawIndex,
indexer: &'a mut RawIndexer,
ranked_map: &'a mut RankedMap,
}
impl<'a> ser::SerializeStruct for StructSerializer<'a> {
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T,
) -> Result<(), Self::Error>
where T: ser::Serialize,
{
serialize_value(
self.schema,
self.document_id,
self.index,
self.indexer,
self.ranked_map,
key,
value,
)
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
fn serialize_value<T: ?Sized>(
schema: &Schema,
document_id: DocumentId,
index: &RawIndex,
indexer: &mut RawIndexer,
ranked_map: &mut RankedMap,
key: &str,
value: &T,
) -> Result<(), SerializerError>
where T: ser::Serialize,
{
if let Some(attr) = schema.attribute(key) {
let props = schema.props(attr);
if props.is_stored() {
let value = rmp_serde::to_vec_named(value)?;
index.set_document_attribute(document_id, attr, value)?;
}
if props.is_indexed() {
let indexer = Indexer {
attribute: attr,
indexer: indexer,
document_id: document_id,
};
value.serialize(indexer)?;
}
if props.is_ranked() {
let key = (document_id, attr);
let number = value.serialize(ConvertToNumber)?;
ranked_map.insert(key, number);
}
}
Ok(())
}
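
A hedged sketch of what serialize_value does for a single attribute; the bindings are assumed to come from an open index and an in-progress document write.

// Route one field of a document by hand (serialize_value is private here).
serialize_value(
    schema,        // &Schema: decides which properties the attribute has
    document_id,   // DocumentId of the document being written
    index,         // &RawIndex: receives the rmp payload if the field is stored
    indexer,       // &mut RawIndexer: receives the words if the field is indexed
    ranked_map,    // &mut RankedMap: receives the number if the field is ranked
    "timestamp",   // attribute name, looked up in the schema
    &1_234_567u64, // the value itself
)?;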

View File

@ -0,0 +1,8 @@
[package]
name = "meilidb-tokenizer"
version = "0.1.0"
authors = ["Kerollmops <renault.cle@gmail.com>"]
edition = "2018"
[dependencies]
slice-group-by = "0.2.4"

View File

@ -0,0 +1,295 @@
use std::iter::Peekable;
use slice_group_by::StrGroupBy;
use self::SeparatorCategory::*;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum SeparatorCategory {
Soft,
Hard,
}
impl SeparatorCategory {
fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
if let (Soft, Soft) = (self, other) { Soft } else { Hard }
}
fn to_usize(self) -> usize {
match self {
Soft => 1,
Hard => 8,
}
}
}
fn is_separator(c: char) -> bool {
classify_separator(c).is_some()
}
fn classify_separator(c: char) -> Option<SeparatorCategory> {
match c {
' ' | '\'' | '"' => Some(Soft),
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard),
_ => None,
}
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
enum CharCategory {
Separator(SeparatorCategory),
Cjk,
Other,
}
fn classify_char(c: char) -> CharCategory {
if let Some(category) = classify_separator(c) {
CharCategory::Separator(category)
} else if is_cjk(c) {
CharCategory::Cjk
} else {
CharCategory::Other
}
}
fn is_str_word(s: &str) -> bool {
!s.chars().any(is_separator)
}
fn same_group_category(a: char, b: char) -> bool {
match (classify_char(a), classify_char(b)) {
(CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
(CharCategory::Separator(_), CharCategory::Separator(_)) => true,
(a, b) => a == b,
}
}
// fold the number of chars along with the byte index position
fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
(n + 1, i + c.len_utf8())
}
pub fn split_query_string(query: &str) -> impl Iterator<Item=&str> {
Tokenizer::new(query).map(|t| t.word)
}
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
pub struct Tokenizer<'a> {
inner: &'a str,
word_index: usize,
char_index: usize,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
// skip every separator and set `char_index`
// to the number of chars trimmed
let (count, index) = string.char_indices()
.take_while(|(_, c)| is_separator(*c))
.fold((0, 0), chars_count_index);
Tokenizer {
inner: &string[index..],
word_index: 0,
char_index: count,
}
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut iter = self.inner.linear_group_by(same_group_category).peekable();
while let (Some(string), next_string) = (iter.next(), iter.peek()) {
let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
if !is_str_word(string) {
self.word_index += string.chars()
.filter_map(classify_separator)
.fold(Soft, |a, x| a.merge(x))
.to_usize();
self.char_index += count;
self.inner = &self.inner[index..];
continue;
}
let token = Token {
word: string,
word_index: self.word_index,
char_index: self.char_index,
};
if next_string.filter(|s| is_str_word(s)).is_some() {
self.word_index += 1;
}
self.char_index += count;
self.inner = &self.inner[index..];
return Some(token);
}
self.inner = "";
None
}
}
pub struct SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
inner: I,
current: Option<Peekable<Tokenizer<'a>>>,
word_offset: usize,
char_offset: usize,
}
impl<'a, I> SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
let current = iter.next().map(|s| Tokenizer::new(s).peekable());
SeqTokenizer {
inner: iter,
current: current,
word_offset: 0,
char_offset: 0,
}
}
}
impl<'a, I> Iterator for SeqTokenizer<'a, I>
where I: Iterator<Item=&'a str>,
{
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
match &mut self.current {
Some(current) => {
match current.next() {
Some(token) => {
// we must apply the word and char offsets
// to the token before returning it
let token = Token {
word: token.word,
word_index: token.word_index + self.word_offset,
char_index: token.char_index + self.char_offset,
};
// if this is the last iteration on this text
// we must save the offsets for the next texts
if current.peek().is_none() {
let hard_space = SeparatorCategory::Hard.to_usize();
self.word_offset = token.word_index + hard_space;
self.char_offset = token.char_index + hard_space;
}
Some(token)
},
None => {
// no more words in this text, we must
// start tokenizing the next text
self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
self.next()
},
}
},
// no more texts available
None => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}
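
A hedged usage sketch of SeqTokenizer: several attribute values are tokenized as one stream, and a hard-separator gap of 8 is inserted between texts so that words from different attributes never appear adjacent.

use meilidb_tokenizer::SeqTokenizer;

fn main() {
    let texts = vec!["hello world", "bonjour"];
    let tokens: Vec<_> = SeqTokenizer::new(texts.into_iter()).collect();

    assert_eq!(tokens[0].word, "hello");   // word_index 0
    assert_eq!(tokens[1].word, "world");   // word_index 1
    assert_eq!(tokens[2].word, "bonjour"); // word_index 1 + 8 = 9
    assert_eq!(tokens[2].word_index, 9);
}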

27
meilidb/Cargo.toml Normal file
View File

@ -0,0 +1,27 @@
[package]
edition = "2018"
name = "meilidb"
version = "0.3.1"
authors = ["Kerollmops <renault.cle@gmail.com>"]
[dependencies]
meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
[features]
default = []
i128 = ["meilidb-core/i128"]
nightly = ["meilidb-core/nightly"]
[dev-dependencies]
csv = "1.0.7"
env_logger = "0.6.1"
jemallocator = "0.1.9"
quickcheck = "0.8.2"
rand = "0.6.5"
rand_xorshift = "0.1.1"
serde = { version = "1.0.90", features = ["derive"] }
structopt = "0.2.15"
tempfile = "3.0.7"
termcolor = "1.0.4"

View File

@ -9,11 +9,10 @@ use std::error::Error;
use std::borrow::Cow; use std::borrow::Cow;
use std::fs::File; use std::fs::File;
use serde_derive::{Serialize, Deserialize}; use serde::{Serialize, Deserialize};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb::database::{Database, Schema}; use meilidb_data::{Database, Schema};
use meilidb::tokenizer::DefaultBuilder;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
@ -51,9 +50,9 @@ fn index(
stop_words: &HashSet<String>, stop_words: &HashSet<String>,
) -> Result<Database, Box<Error>> ) -> Result<Database, Box<Error>>
{ {
let database = Database::create(database_path)?; let database = Database::start_default(database_path)?;
database.create_index("default", &schema)?; let index = database.create_index("default".to_string(), schema.clone())?;
let mut rdr = csv::Reader::from_path(csv_data_path)?; let mut rdr = csv::Reader::from_path(csv_data_path)?;
let mut raw_record = csv::StringRecord::new(); let mut raw_record = csv::StringRecord::new();
@ -63,8 +62,7 @@ fn index(
let mut end_of_file = false; let mut end_of_file = false;
while !end_of_file { while !end_of_file {
let tokenizer_builder = DefaultBuilder::new(); let mut update = index.documents_addition();
let mut update = database.start_update("default")?;
loop { loop {
end_of_file = !rdr.read_record(&mut raw_record)?; end_of_file = !rdr.read_record(&mut raw_record)?;
@ -78,7 +76,7 @@ fn index(
} }
}; };
update.update_document(&document, &tokenizer_builder, &stop_words)?; update.update_document(&document)?;
print!("\rindexing document {}", i); print!("\rindexing document {}", i);
i += 1; i += 1;
@ -91,7 +89,7 @@ fn index(
println!(); println!();
println!("committing update..."); println!("committing update...");
database.commit_update(update)?; update.finalize()?;
} }
Ok(database) Ok(database)
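
Condensed, the new indexing flow used by this example reads as follows (a sketch; database_path, schema, and documents are assumed to exist):

let database = Database::start_default(database_path)?;
let index = database.create_index("default".to_string(), schema.clone())?;

// Documents are queued in a single addition and committed in one batch.
let mut update = index.documents_addition();
for document in documents {
    update.update_document(&document)?; // any serde-serializable document
}
update.finalize()?;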

View File

@ -2,19 +2,19 @@
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc; static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
use std::collections::btree_map::{BTreeMap, Entry}; use std::collections::btree_map::{BTreeMap, Entry};
use std::collections::{HashMap, HashSet};
use std::iter::FromIterator; use std::iter::FromIterator;
use std::io::{self, Write}; use std::io::{self, Write};
use std::time::Instant; use std::time::{Instant, Duration};
use std::path::PathBuf; use std::path::PathBuf;
use std::error::Error; use std::error::Error;
use hashbrown::{HashMap, HashSet};
use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor}; use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
use structopt::StructOpt; use structopt::StructOpt;
use meilidb_core::Match;
use meilidb::database::schema::SchemaAttr; use meilidb_data::schema::SchemaAttr;
use meilidb::database::Database; use meilidb_data::Database;
use meilidb::Match;
#[derive(Debug, StructOpt)] #[derive(Debug, StructOpt)]
pub struct Opt { pub struct Opt {
@ -138,12 +138,19 @@ fn main() -> Result<(), Box<Error>> {
let opt = Opt::from_args(); let opt = Opt::from_args();
let start = Instant::now(); let start = Instant::now();
let database = Database::open(&opt.database_path)?; let database = Database::start_default(&opt.database_path)?;
println!("database prepared for you in {:.2?}", start.elapsed());
let mut buffer = String::new(); let mut buffer = String::new();
let input = io::stdin(); let input = io::stdin();
let index = database.open_index("default")?.unwrap();
let schema = index.schema();
println!("database prepared for you in {:.2?}", start.elapsed());
let fields = opt.displayed_fields.iter().map(String::as_str);
let fields = HashSet::from_iter(fields);
loop { loop {
print!("Searching for: "); print!("Searching for: ");
io::stdout().flush()?; io::stdout().flush()?;
@ -151,32 +158,28 @@ fn main() -> Result<(), Box<Error>> {
if input.read_line(&mut buffer)? == 0 { break } if input.read_line(&mut buffer)? == 0 { break }
let query = buffer.trim_end_matches('\n'); let query = buffer.trim_end_matches('\n');
let view = database.view("default")?; let start_total = Instant::now();
let schema = view.schema();
let start = Instant::now(); let builder = index.query_builder();
let builder = view.query_builder();
let documents = builder.query(query, 0..opt.number_results); let documents = builder.query(query, 0..opt.number_results);
let mut retrieve_duration = Duration::default();
let number_of_documents = documents.len(); let number_of_documents = documents.len();
for mut doc in documents { for mut doc in documents {
doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index)); doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index));
match view.document_by_id::<Document>(doc.id) { let start_retrieve = Instant::now();
Ok(document) => { let result = index.document::<Document>(Some(&fields), doc.id);
for name in &opt.displayed_fields { retrieve_duration += start_retrieve.elapsed();
let attr = match schema.attribute(name) {
Some(attr) => attr,
None => continue,
};
let text = match document.get(name) {
Some(text) => text,
None => continue,
};
match result {
Ok(Some(document)) => {
for (name, text) in document {
print!("{}: ", name); print!("{}: ", name);
let attr = schema.attribute(&name).unwrap();
let matches = doc.matches.iter() let matches = doc.matches.iter()
.filter(|m| SchemaAttr::new(m.attribute) == attr) .filter(|m| SchemaAttr::new(m.attribute) == attr)
.cloned(); .cloned();
@ -186,6 +189,7 @@ fn main() -> Result<(), Box<Error>> {
println!(); println!();
} }
}, },
Ok(None) => eprintln!("missing document"),
Err(e) => eprintln!("{}", e), Err(e) => eprintln!("{}", e),
} }
@ -202,7 +206,8 @@ fn main() -> Result<(), Box<Error>> {
println!(); println!();
} }
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed()); eprintln!("document field retrieve took {:.2?}", retrieve_duration);
eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
buffer.clear(); buffer.clear();
} }
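
The matching query-side flow, condensed from the diff above (a sketch; Document, fields, query, and number_results are the example's own bindings):

let database = Database::start_default(&database_path)?;
let index = database.open_index("default")?.unwrap();

let builder = index.query_builder();
let documents = builder.query(query, 0..number_results);

for doc in documents {
    // Fetch only the displayed fields of each matching document.
    match index.document::<Document>(Some(&fields), doc.id)? {
        Some(document) => println!("{:?}", document),
        None => eprintln!("missing document"),
    }
}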

7
meilidb/src/lib.rs Normal file
View File

@ -0,0 +1,7 @@
#![cfg_attr(feature = "nightly", feature(test))]
mod common_words;
mod sort_by_attr;
pub use self::sort_by_attr::SortByAttr;
pub use self::common_words::CommonWords;

View File

@ -2,10 +2,9 @@ use std::cmp::Ordering;
use std::error::Error; use std::error::Error;
use std::fmt; use std::fmt;
use crate::database::schema::{Schema, SchemaAttr}; use meilidb_core::criterion::Criterion;
use crate::rank::criterion::Criterion; use meilidb_core::RawDocument;
use crate::database::RankedMap; use meilidb_data::{Schema, SchemaAttr, RankedMap};
use crate::rank::RawDocument;
/// A helper struct that permits sorting documents by /// A helper struct that permits sorting documents by
/// some of their stored attributes. /// some of their stored attributes.

View File

@ -1,46 +0,0 @@
use std::collections::{HashSet, HashMap};
use serde_derive::{Serialize, Deserialize};
#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum RankingOrdering {
Asc,
Dsc
}
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct AccessToken {
pub read_key: String,
pub write_key: String,
pub admin_key: String,
}
#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Config {
pub stop_words: Option<HashSet<String>>,
pub ranking_order: Option<Vec<String>>,
pub distinct_field: Option<String>,
pub ranking_rules: Option<HashMap<String, RankingOrdering>>,
pub access_token: Option<AccessToken>,
}
impl Config {
pub fn update_with(&mut self, new: Config) {
if let Some(stop_words) = new.stop_words {
self.stop_words = Some(stop_words);
};
if let Some(ranking_order) = new.ranking_order {
self.ranking_order = Some(ranking_order);
};
if let Some(distinct_field) = new.distinct_field {
self.distinct_field = Some(distinct_field);
};
if let Some(ranking_rules) = new.ranking_rules {
self.ranking_rules = Some(ranking_rules);
};
if let Some(access_token) = new.access_token {
self.access_token = Some(access_token);
};
}
}

View File

@ -1,149 +0,0 @@
use std::io::{Cursor, Read, Write};
use std::mem::size_of;
use std::fmt;
use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt};
use crate::database::schema::SchemaAttr;
use crate::DocumentId;
const DOC_KEY_LEN: usize = 4 + size_of::<u64>();
const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::<u16>();
#[derive(Copy, Clone)]
pub struct DocumentKey([u8; DOC_KEY_LEN]);
impl DocumentKey {
pub fn new(id: DocumentId) -> DocumentKey {
let mut buffer = [0; DOC_KEY_LEN];
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(b"doc-").unwrap();
wtr.write_u64::<BigEndian>(id.0).unwrap();
DocumentKey(buffer)
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey {
assert!(bytes.len() >= DOC_KEY_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKey(buffer)
}
pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), attr)
}
pub fn with_attribute_min(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::min())
}
pub fn with_attribute_max(&self) -> DocumentKeyAttr {
DocumentKeyAttr::new(self.document_id(), SchemaAttr::max())
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
}
impl AsRef<[u8]> for DocumentKey {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKey {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKey")
.field("document_id", &self.document_id())
.finish()
}
}
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]);
impl DocumentKeyAttr {
pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr {
let mut buffer = [0; DOC_KEY_ATTR_LEN];
let DocumentKey(raw_key) = DocumentKey::new(id);
let mut wtr = Cursor::new(&mut buffer[..]);
wtr.write_all(&raw_key).unwrap();
wtr.write_all(b"-").unwrap();
wtr.write_u16::<BigEndian>(attr.0).unwrap();
DocumentKeyAttr(buffer)
}
pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::min())
}
pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr {
DocumentKeyAttr::new(id, SchemaAttr::max())
}
pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr {
assert!(bytes.len() >= DOC_KEY_ATTR_LEN);
assert_eq!(&bytes[..4], b"doc-");
let mut buffer = [0; DOC_KEY_ATTR_LEN];
bytes.read_exact(&mut buffer).unwrap();
DocumentKeyAttr(buffer)
}
pub fn document_id(&self) -> DocumentId {
let id = (&self.0[4..]).read_u64::<BigEndian>().unwrap();
DocumentId(id)
}
pub fn attribute(&self) -> SchemaAttr {
let offset = 4 + size_of::<u64>() + 1;
let value = (&self.0[offset..]).read_u16::<BigEndian>().unwrap();
SchemaAttr::new(value)
}
pub fn into_document_key(self) -> DocumentKey {
DocumentKey::new(self.document_id())
}
}
impl AsRef<[u8]> for DocumentKeyAttr {
fn as_ref(&self) -> &[u8] {
&self.0
}
}
impl fmt::Debug for DocumentKeyAttr {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.debug_struct("DocumentKeyAttr")
.field("document_id", &self.document_id())
.field("attribute", &self.attribute().0)
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn keep_as_ref_order() {
for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) {
let id = DocumentId(0);
let a = DocumentKeyAttr::new(id, SchemaAttr(a));
let b = DocumentKeyAttr::new(id, SchemaAttr(b));
assert!(a < b);
assert!(a.as_ref() < b.as_ref());
}
}
}
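
The big-endian encoding in these (now removed) keys is what kept RocksDB's lexicographic key order aligned with numeric order, as the keep_as_ref_order test checks. A tiny self-contained demonstration of that property:

fn main() {
    // Byte-wise comparison of big-endian encodings matches numeric order.
    assert!(1u64.to_be_bytes() < 256u64.to_be_bytes());
    // Little-endian breaks it: 256 = [0, 1, 0, ...] sorts before 1 = [1, 0, ...].
    assert!(256u64.to_le_bytes() < 1u64.to_le_bytes());
}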

View File

@ -1,911 +0,0 @@
use std::time::Instant;
use std::error::Error;
use std::ffi::OsStr;
use std::sync::Arc;
use std::fs;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicBool, Ordering};
use std::ops::{Deref, DerefMut};
use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{Writable, Snapshot};
use rocksdb::{DB, MergeOperands};
use size_format::SizeFormatterBinary;
use arc_swap::ArcSwap;
use lockfree::map::Map;
use hashbrown::HashMap;
use log::{info, error, warn};
use crate::database::schema::SchemaAttr;
use crate::shared_data_cursor::FromSharedDataCursor;
use crate::write_to_bytes::WriteToBytes;
use crate::DocumentId;
use self::update::{ReadIndexEvent, ReadRankedMapEvent};
pub use self::config::Config;
pub use self::document_key::{DocumentKey, DocumentKeyAttr};
pub use self::view::{DatabaseView, DocumentIter};
pub use self::update::Update;
pub use self::serde::SerializerError;
pub use self::schema::Schema;
pub use self::index::Index;
pub use self::number::{Number, ParseNumberError};
pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
const DATA_INDEX: &[u8] = b"data-index";
const DATA_RANKED_MAP: &[u8] = b"data-ranked-map";
const DATA_SCHEMA: &[u8] = b"data-schema";
const CONFIG: &[u8] = b"config";
pub mod config;
pub mod schema;
pub(crate) mod index;
mod number;
mod document_key;
mod serde;
mod update;
mod view;
fn retrieve_data_schema<D>(snapshot: &Snapshot<D>) -> Result<Schema, Box<Error>>
where D: Deref<Target=DB>
{
match snapshot.get(DATA_SCHEMA)? {
Some(vector) => Ok(Schema::read_from_bin(&*vector)?),
None => Err(String::from("BUG: no schema found in the database").into()),
}
}
fn retrieve_data_index<D>(snapshot: &Snapshot<D>) -> Result<Index, Box<Error>>
where D: Deref<Target=DB>
{
let start = Instant::now();
let vector = snapshot.get(DATA_INDEX)?;
info!("loading index from kv-store took {:.2?}", start.elapsed());
match vector {
Some(vector) => {
let start = Instant::now();
let bytes = vector.as_ref().to_vec();
info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
let event = ReadIndexEvent::from_bytes(bytes)?;
let index = event.updated_documents().expect("BUG: invalid event deserialized");
info!("loading index from bytes took {:.2?}", start.elapsed());
Ok(index)
},
None => Ok(Index::default()),
}
}
fn retrieve_data_ranked_map<D>(snapshot: &Snapshot<D>) -> Result<RankedMap, Box<Error>>
where D: Deref<Target=DB>,
{
let start = Instant::now();
let vector = snapshot.get(DATA_RANKED_MAP)?;
info!("loading ranked map from kv-store took {:.2?}", start.elapsed());
match vector {
Some(vector) => {
let start = Instant::now();
let bytes = vector.as_ref().to_vec();
info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64));
let event = ReadRankedMapEvent::from_bytes(bytes)?;
let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized");
info!("loading ranked map from bytes took {:.2?}", start.elapsed());
Ok(ranked_map)
},
None => Ok(RankedMap::new()),
}
}
fn retrieve_config<D>(snapshot: &Snapshot<D>) -> Result<Config, Box<Error>>
where D: Deref<Target=DB>,
{
match snapshot.get(CONFIG)? {
Some(vector) => Ok(bincode::deserialize(&*vector)?),
None => Ok(Config::default()),
}
}
fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
use self::update::ReadIndexEvent::{self, *};
use self::update::WriteIndexEvent;
let mut index = Index::default();
for bytes in existing.into_iter().chain(operands) {
match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() {
RemovedDocuments(d) => index = index.remove_documents(d.as_ref()),
UpdatedDocuments(i) => index = index.union(&i),
}
}
WriteIndexEvent::UpdatedDocuments(&index).into_bytes()
}
fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
use self::update::ReadRankedMapEvent::{self, *};
use self::update::WriteRankedMapEvent;
let mut ranked_map = RankedMap::default();
for bytes in existing.into_iter().chain(operands) {
match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() {
RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()),
UpdatedDocuments(i) => ranked_map.extend(i),
}
}
WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes()
}
fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec<u8> {
match key {
DATA_INDEX => merge_indexes(existing, operands),
DATA_RANKED_MAP => merge_ranked_maps(existing, operands),
key => panic!("The merge operator does not support merging {:?}", key),
}
}
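
A simplified, self-contained model of this (now removed) merge fold, with illustrative types standing in for the crate's index and event types:

enum Event {
    Removed(Vec<u64>),
    Updated(Vec<u64>),
}

// Apply the base value and every pending operand in order, like the
// RocksDB merge operator above.
fn merge(existing: Option<Vec<u64>>, operands: &[Event]) -> Vec<u64> {
    let mut acc = existing.unwrap_or_default();
    for op in operands {
        match op {
            Event::Removed(ids) => acc.retain(|id| !ids.contains(id)),
            Event::Updated(ids) => acc.extend_from_slice(ids),
        }
    }
    acc
}

fn main() {
    let ops = [Event::Updated(vec![1, 2, 3]), Event::Removed(vec![2])];
    assert_eq!(merge(None, &ops), vec![1, 3]);
}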
pub struct IndexUpdate {
index: String,
update: Update,
}
impl Deref for IndexUpdate {
type Target = Update;
fn deref(&self) -> &Update {
&self.update
}
}
impl DerefMut for IndexUpdate {
fn deref_mut(&mut self) -> &mut Update {
&mut self.update
}
}
struct DatabaseIndex {
db: Arc<DB>,
// This view is updated each time the DB ingests an update.
view: ArcSwap<DatabaseView<Arc<DB>>>,
// The path of the mdb folder stored on disk.
path: PathBuf,
// must_die is false by default and is set to true when the user requests
// to delete an index; Drop then erases the index folder saved on disk.
must_die: AtomicBool,
}
impl DatabaseIndex {
fn create<P: AsRef<Path>>(path: P, schema: &Schema) -> Result<DatabaseIndex, Box<Error>> {
let path = path.as_ref();
if path.exists() {
return Err(format!("File already exists at path: {}, cannot create database.",
path.display()).into())
}
let path_lossy = path.to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(true);
// opts.error_if_exists(true); // FIXME: upstream a pull request for this option
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
let mut schema_bytes = Vec::new();
schema.write_to_bin(&mut schema_bytes)?;
db.put(DATA_SCHEMA, &schema_bytes)?;
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(DatabaseIndex {
db: db,
view: view,
path: path.to_path_buf(),
must_die: AtomicBool::new(false)
})
}
fn open<P: AsRef<Path>>(path: P) -> Result<DatabaseIndex, Box<Error>> {
let path_lossy = path.as_ref().to_string_lossy();
let mut opts = DBOptions::new();
opts.create_if_missing(false);
let mut cf_opts = ColumnFamilyOptions::new();
cf_opts.add_merge_operator("data merge operator", merge_operator);
let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?;
// FIXME: create a generic function to do that!
let _schema = match db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from_bin(&*value)?,
None => return Err(String::from("Database does not contain a schema").into()),
};
let db = Arc::new(db);
let snapshot = Snapshot::new(db.clone());
let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?));
Ok(DatabaseIndex {
db: db,
view: view,
path: path.as_ref().to_path_buf(),
must_die: AtomicBool::new(false)
})
}
fn must_die(&self) {
self.must_die.store(true, Ordering::Relaxed)
}
fn start_update(&self) -> Result<Update, Box<Error>> {
let schema = match self.db.get(DATA_SCHEMA)? {
Some(value) => Schema::read_from_bin(&*value)?,
None => panic!("Database does not contain a schema"),
};
Ok(Update::new(schema))
}
fn commit_update(&self, update: Update) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let batch = update.build()?;
self.db.write(batch)?;
self.db.compact_range(None, None);
self.db.flush(true)?;
let snapshot = Snapshot::new(self.db.clone());
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.store(view.clone());
Ok(view)
}
fn view(&self) -> Arc<DatabaseView<Arc<DB>>> {
self.view.load()
}
fn get_config(&self) -> Config {
self.view().config().clone()
}
fn update_config(&self, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>>{
let data = bincode::serialize(&config)?;
self.db.put(CONFIG, &data)?;
let snapshot = Snapshot::new(self.db.clone());
let view = Arc::new(DatabaseView::new(snapshot)?);
self.view.store(view.clone());
Ok(view)
}
fn path(&self) -> &Path {
self.path.as_path()
}
}
impl Drop for DatabaseIndex {
fn drop(&mut self) {
if self.must_die.load(Ordering::Relaxed) {
if let Err(err) = fs::remove_dir_all(&self.path) {
error!("Impossible to remove mdb when Database is dropped; {}", err);
}
}
}
}
pub struct Database {
indexes: Map<String, Arc<DatabaseIndex>>,
path: PathBuf,
}
impl Database {
pub fn create<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
Ok(Database {
indexes: Map::new(),
path: path.as_ref().to_path_buf(),
})
}
pub fn open<P: AsRef<Path>>(path: P) -> Result<Database, Box<Error>> {
let entries = fs::read_dir(&path)?;
let indexes = Map::new();
for entry in entries {
let path = match entry {
Ok(p) => p.path(),
Err(err) => {
warn!("Impossible to retrieve the path from an entry; {}", err);
continue
}
};
let name = match path.file_stem().and_then(OsStr::to_str) {
Some(name) => name.to_owned(),
None => continue
};
let db = match DatabaseIndex::open(path.clone()) {
Ok(db) => db,
Err(err) => {
warn!("Impossible to open the database; {}", err);
continue
}
};
info!("Load database {}", name);
indexes.insert(name, Arc::new(db));
}
Ok(Database {
            indexes,
path: path.as_ref().to_path_buf(),
})
}
pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box<Error>> {
let index_path = self.path.join(name);
if index_path.exists() {
return Err("Index already exists".into());
}
let index = DatabaseIndex::create(index_path, schema)?;
self.indexes.insert(name.to_owned(), Arc::new(index));
Ok(())
}
pub fn delete_index(&self, name: &str) -> Result<(), Box<Error>> {
let index_guard = self.indexes.remove(name).ok_or("Index not found")?;
index_guard.val().must_die();
Ok(())
}
pub fn list_indexes(&self) -> Vec<String> {
self.indexes.iter().map(|g| g.key().clone()).collect()
}
pub fn start_update(&self, index: &str) -> Result<IndexUpdate, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
let update = index_guard.val().start_update()?;
Ok(IndexUpdate { index: index.to_owned(), update })
}
    pub fn commit_update(&self, update: IndexUpdate) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?;
index_guard.val().commit_update(update.update)
}
pub fn view(&self, index: &str) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().view())
}
pub fn get_config(&self, index: &str) -> Result<Config, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().get_config())
}
    pub fn update_config(&self, index: &str, config: Config) -> Result<Arc<DatabaseView<Arc<DB>>>, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
Ok(index_guard.val().update_config(config)?)
}
pub fn path(&self) -> &Path {
self.path.as_path()
}
pub fn index_path(&self, index: &str) -> Result<PathBuf, Box<Error>> {
let index_guard = self.indexes.get(index).ok_or("Index not found")?;
let path = index_guard.val().path();
Ok(path.to_path_buf())
}
}
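
A minimal usage sketch of the `Database` API above, mirroring the tests below (hypothetical path and attribute names, error handling via `?`; not part of the original sources):

let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
let schema = builder.build();

let database = Database::create("/tmp/example.mdb")?;
database.create_index("default", &schema)?;
assert_eq!(database.list_indexes(), vec![String::from("default")]);

let mut update = database.start_update("default")?;
// update.update_document(&document, &DefaultBuilder::new(), &stop_words)?;
let view = database.commit_update(update)?;
// `view` is a consistent snapshot; query it with view.query_builder().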
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use std::error::Error;
use serde_derive::{Serialize, Deserialize};
use crate::database::schema::{SchemaBuilder, STORED, INDEXED};
use crate::tokenizer::DefaultBuilder;
use super::*;
#[test]
fn ingest_one_easy_update() -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&meilidb_path)?;
database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let tokenizer_builder = DefaultBuilder::new();
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
let view = database.commit_update(builder)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
Ok(dir.close()?)
}
#[test]
fn ingest_two_easy_updates() -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let meilidb_path = dir.path().join("meilidb.mdb");
let meilidb_index_name = "default";
#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
struct SimpleDoc {
id: u64,
title: String,
description: String,
timestamp: u64,
}
let schema = {
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("id", STORED);
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
builder.new_attribute("timestamp", STORED);
builder.build()
};
let database = Database::create(&meilidb_path)?;
database.create_index(meilidb_index_name, &schema)?;
let doc0 = SimpleDoc {
id: 0,
title: String::from("I am a title"),
description: String::from("I am a description"),
timestamp: 1234567,
};
let doc1 = SimpleDoc {
id: 1,
title: String::from("I am the second title"),
description: String::from("I am the second description"),
timestamp: 7654321,
};
let doc2 = SimpleDoc {
id: 2,
title: String::from("I am the third title"),
description: String::from("I am the third description"),
timestamp: 7654321,
};
let doc3 = SimpleDoc {
id: 3,
title: String::from("I am the fourth title"),
description: String::from("I am the fourth description"),
timestamp: 7654321,
};
let tokenizer_builder = DefaultBuilder::new();
let mut builder = database.start_update(meilidb_index_name)?;
let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?;
let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?;
database.commit_update(builder)?;
let mut builder = database.start_update(meilidb_index_name)?;
let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?;
let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?;
let view = database.commit_update(builder)?;
let de_doc0: SimpleDoc = view.document_by_id(docid0)?;
let de_doc1: SimpleDoc = view.document_by_id(docid1)?;
assert_eq!(doc0, de_doc0);
assert_eq!(doc1, de_doc1);
let de_doc2: SimpleDoc = view.document_by_id(docid2)?;
let de_doc3: SimpleDoc = view.document_by_id(docid3)?;
assert_eq!(doc2, de_doc2);
assert_eq!(doc3, de_doc3);
Ok(dir.close()?)
}
}
#[cfg(all(feature = "nightly", test))]
mod bench {
extern crate test;
use std::collections::HashSet;
use std::error::Error;
use std::iter::repeat_with;
use self::test::Bencher;
use rand::distributions::Alphanumeric;
use rand_xorshift::XorShiftRng;
use rand::{Rng, SeedableRng};
use serde_derive::Serialize;
use rand::seq::SliceRandom;
use crate::tokenizer::DefaultBuilder;
use crate::database::schema::*;
use super::*;
fn random_sentences<R: Rng>(number: usize, rng: &mut R) -> String {
let mut words = String::new();
for i in 0..number {
let word_len = rng.gen_range(1, 12);
let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len);
words.extend(iter);
if i == number - 1 { // last word
let final_ = [".", "?", "!", "..."].choose(rng).cloned();
words.extend(final_);
} else {
let middle = [",", ", "].choose(rng).cloned();
words.extend(middle);
}
}
words
}
#[bench]
fn open_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
            test::black_box(database);
});
Ok(())
}
#[bench]
fn open_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
            test::black_box(database);
});
Ok(())
}
#[bench]
#[ignore]
fn open_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
database.commit_update(builder)?;
drop(database);
bench.iter(|| {
let database = Database::open(db_path.clone()).unwrap();
            test::black_box(database);
});
Ok(())
}
#[bench]
fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..300 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
                test::black_box(documents);
}
});
Ok(())
}
#[bench]
fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..3000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
                test::black_box(documents);
}
});
Ok(())
}
#[bench]
#[ignore]
fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box<Error>> {
let dir = tempfile::tempdir()?;
let stop_words = HashSet::new();
let mut builder = SchemaBuilder::with_identifier("id");
builder.new_attribute("title", STORED | INDEXED);
builder.new_attribute("description", STORED | INDEXED);
let schema = builder.build();
let db_path = dir.path().join("bench.mdb");
let index_name = "default";
let database = Database::create(&db_path)?;
database.create_index(index_name, &schema)?;
#[derive(Serialize)]
struct Document {
id: u64,
title: String,
description: String,
}
let tokenizer_builder = DefaultBuilder;
let mut builder = database.start_update(index_name)?;
let mut rng = XorShiftRng::seed_from_u64(42);
for i in 0..30_000 {
let document = Document {
id: i,
title: random_sentences(rng.gen_range(1, 8), &mut rng),
description: random_sentences(rng.gen_range(20, 200), &mut rng),
};
builder.update_document(&document, &tokenizer_builder, &stop_words)?;
}
let view = database.commit_update(builder)?;
bench.iter(|| {
for q in &["a", "b", "c", "d", "e"] {
let documents = view.query_builder().query(q, 0..20);
                test::black_box(documents);
}
});
Ok(())
}
}

View File

@ -1,98 +0,0 @@
use std::cmp::Ordering;
use std::str::FromStr;
use std::fmt;
use serde_derive::{Serialize, Deserialize};
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone)]
pub enum Number {
Unsigned(u64),
Signed(i64),
Float(f64),
}
impl FromStr for Number {
type Err = ParseNumberError;
fn from_str(s: &str) -> Result<Self, Self::Err> {
if let Ok(unsigned) = u64::from_str(s) {
return Ok(Number::Unsigned(unsigned))
}
if let Ok(signed) = i64::from_str(s) {
return Ok(Number::Signed(signed))
}
if let Ok(float) = f64::from_str(s) {
if float == 0.0 || float.is_normal() {
return Ok(Number::Float(float))
}
}
Err(ParseNumberError)
}
}
impl PartialOrd for Number {
fn partial_cmp(&self, other: &Number) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl Ord for Number {
fn cmp(&self, other: &Number) -> Ordering {
use Number::*;
match (self, other) {
(Unsigned(s), Unsigned(o)) => s.cmp(o),
(Unsigned(s), Signed(o)) => {
let s = i128::from(*s);
let o = i128::from(*o);
s.cmp(&o)
},
(Unsigned(s), Float(o)) => {
let s = *s as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Signed(s), Unsigned(o)) => {
let s = i128::from(*s);
let o = i128::from(*o);
s.cmp(&o)
},
(Signed(s), Signed(o)) => s.cmp(o),
(Signed(s), Float(o)) => {
let s = *s as f64;
s.partial_cmp(o).unwrap_or(Ordering::Equal)
},
(Float(s), Unsigned(o)) => {
let o = *o as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Float(s), Signed(o)) => {
let o = *o as f64;
s.partial_cmp(&o).unwrap_or(Ordering::Equal)
},
(Float(s), Float(o)) => {
s.partial_cmp(o).unwrap_or(Ordering::Equal)
},
}
}
}
impl PartialEq for Number {
fn eq(&self, other: &Number) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl Eq for Number { }
pub struct ParseNumberError;
impl fmt::Display for ParseNumberError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str("can not parse number")
}
}
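
Parsing tries `u64` first, then `i64`, then a finite `f64`, and `Ord` compares across variants through `i128` or `f64` promotion. A few illustrative assertions (a sketch, not part of the original file):

use std::str::FromStr;

assert_eq!(Number::from_str("12").ok(), Some(Number::Unsigned(12)));
assert_eq!(Number::from_str("-7").ok(), Some(Number::Signed(-7)));
assert_eq!(Number::from_str("1.5").ok(), Some(Number::Float(1.5)));
// NaN is rejected: it is neither zero nor a normal float.
assert!(Number::from_str("NaN").is_err());
// Mixed variants compare through promotion.
assert!(Number::Unsigned(3) < Number::Float(3.5));
assert_eq!(Number::Signed(4), Number::Float(4.0));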

View File

@ -1,186 +0,0 @@
use std::error::Error;
use std::ops::Deref;
use std::fmt;
use rocksdb::rocksdb::{DB, Snapshot, SeekKey};
use rocksdb::rocksdb_options::ReadOptions;
use serde::forward_to_deserialize_any;
use serde::de::value::MapDeserializer;
use serde::de::{self, Visitor, IntoDeserializer};
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Deserializer<'a, D>
where D: Deref<Target=DB>
{
snapshot: &'a Snapshot<D>,
schema: &'a Schema,
document_id: DocumentId,
}
impl<'a, D> Deserializer<'a, D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: &'a Snapshot<D>, schema: &'a Schema, doc: DocumentId) -> Self {
Deserializer { snapshot, schema, document_id: doc }
}
}
impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D>
where D: Deref<Target=DB>
{
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_map(visitor)
}
forward_to_deserialize_any! {
bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
bytes byte_buf unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
let mut options = ReadOptions::new();
let lower = DocumentKey::new(self.document_id);
let upper = lower.with_attribute_max();
options.set_iterate_lower_bound(lower.as_ref());
options.set_iterate_upper_bound(upper.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
if iter.kv().is_none() {
// FIXME return an error
}
let iter = iter.map(|(key, value)| {
// retrieve the schema attribute name
// from the schema attribute number
let document_key_attr = DocumentKeyAttr::from_bytes(&key);
let schema_attr = document_key_attr.attribute();
let attribute_name = self.schema.attribute_name(schema_attr);
(attribute_name, Value(value))
});
let map_deserializer = MapDeserializer::new(iter);
visitor.visit_map(map_deserializer)
}
}
struct Value(Vec<u8>);
impl<'de> IntoDeserializer<'de, DeserializerError> for Value {
type Deserializer = Self;
fn into_deserializer(self) -> Self::Deserializer {
self
}
}
macro_rules! forward_to_bincode_values {
($($ty:ident => $de_method:ident,)*) => {
$(
fn $de_method<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: de::Visitor<'de>
{
match bincode::deserialize::<$ty>(&self.0) {
Ok(val) => val.into_deserializer().$de_method(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
)*
}
}
impl<'de, 'a> de::Deserializer<'de> for Value {
type Error = DeserializerError;
fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.0.into_deserializer().deserialize_any(visitor)
}
fn deserialize_str<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_string(visitor)
}
fn deserialize_string<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<String>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_string(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
fn deserialize_bytes<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
self.deserialize_byte_buf(visitor)
}
fn deserialize_byte_buf<V>(self, visitor: V) -> Result<V::Value, Self::Error>
where V: Visitor<'de>
{
match bincode::deserialize::<Vec<u8>>(&self.0) {
Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor),
Err(e) => Err(de::Error::custom(e)),
}
}
forward_to_bincode_values! {
char => deserialize_char,
bool => deserialize_bool,
u8 => deserialize_u8,
u16 => deserialize_u16,
u32 => deserialize_u32,
u64 => deserialize_u64,
i8 => deserialize_i8,
i16 => deserialize_i16,
i32 => deserialize_i32,
i64 => deserialize_i64,
f32 => deserialize_f32,
f64 => deserialize_f64,
}
forward_to_deserialize_any! {
unit seq map
unit_struct tuple_struct
identifier tuple ignored_any option newtype_struct enum struct
}
}
#[derive(Debug)]
pub enum DeserializerError {
Custom(String),
}
impl de::Error for DeserializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
DeserializerError::Custom(msg.to_string())
}
}
impl fmt::Display for DeserializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
DeserializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for DeserializerError {}
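
`Value` wraps the bincode-encoded bytes stored for one attribute and replays them through serde. A round-trip sketch (it would have to live inside this module, since `Value` is private):

use serde::Deserialize;

// Encode a value the way the serializer side does...
let bytes = bincode::serialize(&String::from("hello")).unwrap();
// ...then decode it back through the attribute Value deserializer.
let text = String::deserialize(Value(bytes)).unwrap();
assert_eq!(text, "hello");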

View File

@ -1,194 +0,0 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::tokenizer::TokenizerBuilder;
use crate::tokenizer::Token;
use crate::{is_cjk, DocumentId, DocIndex};
pub struct IndexerSerializer<'a, 'b, B> {
pub tokenizer_builder: &'a B,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub attribute: SchemaAttr,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, v: &str) -> Result<Self::Ok, Self::Error> {
for token in self.tokenizer_builder.build(v) {
let Token { word, word_index, char_index } = token;
let document_id = self.document_id;
            // FIXME: use u32::try_from instead of an `as` cast
let attribute = self.attribute.0;
let word_index = word_index as u16;
// insert the exact representation
let word_lower = word.to_lowercase();
let length = word.chars().count() as u16;
if self.stop_words.contains(&word_lower) { continue }
// and the unidecoded lowercased version
if !word_lower.chars().any(is_cjk) {
let word_unidecoded = unidecode::unidecode(word).to_lowercase();
let word_unidecoded = word_unidecoded.trim();
if word_lower != word_unidecoded {
let char_index = char_index as u16;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?;
}
}
let char_index = char_index as u16;
let char_length = length;
let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length };
self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?;
}
Ok(())
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "seq" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Err(SerializerError::UnserializableType { name: "map" })
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct" })
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
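
Note how `serialize_str` indexes an accented token twice: once lowercased exactly, and once lowercased after unidecoding, so both spellings match at search time. A small illustration of the two forms (assuming the `unidecode` crate from the manifest):

let word = "Café";
let word_lower = word.to_lowercase();                            // "café"
let word_unidecoded = unidecode::unidecode(word).to_lowercase(); // "cafe"
// The two forms differ, so both receive a DocIndex entry.
assert_ne!(word_lower, word_unidecoded.trim());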

View File

@ -1,65 +0,0 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
use std::error::Error;
use std::fmt;
use serde::ser;
macro_rules! forward_to_unserializable_type {
($($ty:ident => $se_method:ident,)*) => {
$(
fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
                Err(SerializerError::UnserializableType { name: stringify!($ty) })
}
)*
}
}
pub mod find_id;
pub mod key_to_string;
pub mod value_to_number;
pub mod serializer;
pub mod indexer_serializer;
pub mod deserializer;
pub fn calculate_hash<T: Hash>(t: &T) -> u64 {
let mut s = DefaultHasher::new();
t.hash(&mut s);
s.finish()
}
#[derive(Debug)]
pub enum SerializerError {
DocumentIdNotFound,
UnserializableType { name: &'static str },
Custom(String),
}
impl ser::Error for SerializerError {
fn custom<T: fmt::Display>(msg: T) -> Self {
SerializerError::Custom(msg.to_string())
}
}
impl fmt::Display for SerializerError {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
match self {
SerializerError::DocumentIdNotFound => {
write!(f, "serialized document does not have an id according to the schema")
}
SerializerError::UnserializableType { name } => {
write!(f, "Only struct and map types are considered valid documents and
can be serialized, not {} types directly.", name)
},
SerializerError::Custom(s) => f.write_str(&s),
}
}
}
impl Error for SerializerError {}
impl From<String> for SerializerError {
fn from(value: String) -> SerializerError {
SerializerError::Custom(value)
}
}

View File

@ -1,296 +0,0 @@
use std::collections::HashSet;
use serde::Serialize;
use serde::ser;
use crate::database::serde::indexer_serializer::IndexerSerializer;
use crate::database::serde::key_to_string::KeyToStringSerializer;
use crate::database::serde::value_to_number::ValueToNumberSerializer;
use crate::database::update::DocumentUpdate;
use crate::database::serde::SerializerError;
use crate::tokenizer::TokenizerBuilder;
use crate::database::schema::Schema;
use crate::DocumentId;
pub struct Serializer<'a, 'b, B> {
pub schema: &'a Schema,
pub update: &'a mut DocumentUpdate<'b>,
pub document_id: DocumentId,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
type SerializeMap = MapSerializer<'a, 'b, B>;
type SerializeStruct = StructSerializer<'a, 'b, B>;
type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
forward_to_unserializable_type! {
bool => serialize_bool,
char => serialize_char,
i8 => serialize_i8,
i16 => serialize_i16,
i32 => serialize_i32,
i64 => serialize_i64,
u8 => serialize_u8,
u16 => serialize_u16,
u32 => serialize_u32,
u64 => serialize_u64,
f32 => serialize_f32,
f64 => serialize_f64,
}
fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "str" })
}
fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "&[u8]" })
}
fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "Option" })
}
fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "()" })
}
fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
Err(SerializerError::UnserializableType { name: "unit struct" })
}
fn serialize_unit_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str
) -> Result<Self::Ok, Self::Error>
{
Err(SerializerError::UnserializableType { name: "unit variant" })
}
fn serialize_newtype_struct<T: ?Sized>(
self,
_name: &'static str,
value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
value.serialize(self)
}
fn serialize_newtype_variant<T: ?Sized>(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_value: &T
) -> Result<Self::Ok, Self::Error>
where T: Serialize,
{
Err(SerializerError::UnserializableType { name: "newtype variant" })
}
fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
Err(SerializerError::UnserializableType { name: "sequence" })
}
fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
Err(SerializerError::UnserializableType { name: "tuple" })
}
fn serialize_tuple_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeTupleStruct, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple struct" })
}
fn serialize_tuple_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeTupleVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "tuple variant" })
}
fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
Ok(MapSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
current_key_name: None,
})
}
fn serialize_struct(
self,
_name: &'static str,
_len: usize
) -> Result<Self::SerializeStruct, Self::Error>
{
Ok(StructSerializer {
schema: self.schema,
document_id: self.document_id,
update: self.update,
tokenizer_builder: self.tokenizer_builder,
stop_words: self.stop_words,
})
}
fn serialize_struct_variant(
self,
_name: &'static str,
_variant_index: u32,
_variant: &'static str,
_len: usize
) -> Result<Self::SerializeStructVariant, Self::Error>
{
Err(SerializerError::UnserializableType { name: "struct variant" })
}
}
pub struct MapSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
pub current_key_name: Option<String>,
}
impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
self.current_key_name = Some(key);
Ok(())
}
fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
where T: Serialize,
{
let key = self.current_key_name.take().unwrap();
self.serialize_entry(&key, value)
}
fn serialize_entry<K: ?Sized, V: ?Sized>(
&mut self,
key: &K,
value: &V,
) -> Result<(), Self::Error>
where K: Serialize, V: Serialize,
{
let key = key.serialize(KeyToStringSerializer)?;
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
if props.is_ranked() {
let number = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, number)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}
pub struct StructSerializer<'a, 'b, B> {
pub schema: &'a Schema,
pub document_id: DocumentId,
pub update: &'a mut DocumentUpdate<'b>,
pub tokenizer_builder: &'a B,
pub stop_words: &'a HashSet<String>,
}
impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B>
where B: TokenizerBuilder
{
type Ok = ();
type Error = SerializerError;
fn serialize_field<T: ?Sized>(
&mut self,
key: &'static str,
value: &T
) -> Result<(), Self::Error>
where T: Serialize,
{
if let Some(attr) = self.schema.attribute(key) {
let props = self.schema.props(attr);
if props.is_stored() {
let value = bincode::serialize(value).unwrap();
self.update.insert_attribute_value(attr, &value)?;
}
if props.is_indexed() {
let serializer = IndexerSerializer {
update: self.update,
tokenizer_builder: self.tokenizer_builder,
document_id: self.document_id,
attribute: attr,
stop_words: self.stop_words,
};
value.serialize(serializer)?;
}
if props.is_ranked() {
let integer = value.serialize(ValueToNumberSerializer)?;
self.update.register_ranked_attribute(attr, integer)?;
}
}
Ok(())
}
fn end(self) -> Result<Self::Ok, Self::Error> {
Ok(())
}
}

View File

@ -1,55 +0,0 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::Index;
use crate::data::DocIds;
pub enum WriteIndexEvent<'a> {
RemovedDocuments(&'a DocIds),
UpdatedDocuments(&'a Index),
}
impl<'a> WriteToBytes for WriteIndexEvent<'a> {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
match self {
WriteIndexEvent::RemovedDocuments(doc_ids) => {
let _ = bytes.write_u8(0);
doc_ids.write_to_bytes(bytes);
},
WriteIndexEvent::UpdatedDocuments(index) => {
let _ = bytes.write_u8(1);
index.write_to_bytes(bytes);
}
}
}
}
pub enum ReadIndexEvent {
RemovedDocuments(DocIds),
UpdatedDocuments(Index),
}
impl ReadIndexEvent {
pub fn updated_documents(self) -> Option<Index> {
use ReadIndexEvent::*;
match self {
RemovedDocuments(_) => None,
UpdatedDocuments(index) => Some(index),
}
}
}
impl FromSharedDataCursor for ReadIndexEvent {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
match cursor.read_u8()? {
0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments),
1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments),
_ => unreachable!(),
}
}
}
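
The write and read sides agree on a one-byte tag: 0 for removed documents, 1 for updated ones. A framing sketch (assumes a pre-built `doc_ids: DocIds`; not part of the original sources):

let mut bytes = Vec::new();
WriteIndexEvent::RemovedDocuments(&doc_ids).write_to_bytes(&mut bytes);
assert_eq!(bytes[0], 0); // 0 = RemovedDocuments, 1 = UpdatedDocuments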

View File

@ -1,239 +0,0 @@
use std::collections::{HashSet, BTreeMap};
use std::error::Error;
use rocksdb::rocksdb::{Writable, WriteBatch};
use hashbrown::hash_map::HashMap;
use sdset::{Set, SetBuf};
use serde::Serialize;
use crate::database::document_key::{DocumentKey, DocumentKeyAttr};
use crate::database::serde::serializer::Serializer;
use crate::database::serde::SerializerError;
use crate::database::schema::SchemaAttr;
use crate::database::schema::Schema;
use crate::database::index::IndexBuilder;
use crate::database::{DATA_INDEX, DATA_RANKED_MAP};
use crate::database::{RankedMap, Number};
use crate::tokenizer::TokenizerBuilder;
use crate::write_to_bytes::WriteToBytes;
use crate::data::DocIds;
use crate::{DocumentId, DocIndex};
pub use self::index_event::{ReadIndexEvent, WriteIndexEvent};
pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent};
mod index_event;
mod ranked_map_event;
pub type Token = Vec<u8>; // TODO could be replaced by a SmallVec
pub struct Update {
schema: Schema,
raw_builder: RawUpdateBuilder,
}
impl Update {
pub(crate) fn new(schema: Schema) -> Update {
Update { schema, raw_builder: RawUpdateBuilder::new() }
}
pub fn update_document<T, B>(
&mut self,
document: T,
tokenizer_builder: &B,
stop_words: &HashSet<String>,
) -> Result<DocumentId, SerializerError>
where T: Serialize,
B: TokenizerBuilder,
{
let document_id = self.schema.document_id(&document)?;
let serializer = Serializer {
schema: &self.schema,
            document_id,
            tokenizer_builder,
            update: &mut self.raw_builder.document_update(document_id)?,
            stop_words,
};
document.serialize(serializer)?;
Ok(document_id)
}
pub fn remove_document<T>(&mut self, document: T) -> Result<DocumentId, SerializerError>
where T: Serialize,
{
let document_id = self.schema.document_id(&document)?;
self.raw_builder.document_update(document_id)?.remove()?;
Ok(document_id)
}
pub(crate) fn build(self) -> Result<WriteBatch, Box<Error>> {
self.raw_builder.build()
}
}
#[derive(Copy, Clone, PartialEq, Eq)]
enum UpdateType {
Updated,
Deleted,
}
use UpdateType::{Updated, Deleted};
pub struct RawUpdateBuilder {
documents_update: HashMap<DocumentId, UpdateType>,
documents_ranked_fields: RankedMap,
indexed_words: BTreeMap<Token, Vec<DocIndex>>,
batch: WriteBatch,
}
impl RawUpdateBuilder {
pub fn new() -> RawUpdateBuilder {
RawUpdateBuilder {
documents_update: HashMap::new(),
documents_ranked_fields: HashMap::new(),
indexed_words: BTreeMap::new(),
batch: WriteBatch::new(),
}
}
pub fn document_update(&mut self, document_id: DocumentId) -> Result<DocumentUpdate, SerializerError> {
use serde::ser::Error;
match self.documents_update.get(&document_id) {
Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }),
Some(Updated) => Err(SerializerError::custom(
"This document has already been removed and cannot be updated in the same update"
)),
}
}
pub fn build(self) -> Result<WriteBatch, Box<Error>> {
// create the list of all the removed documents
let removed_documents = {
let mut document_ids = Vec::new();
for (id, update_type) in self.documents_update {
if update_type == Deleted {
document_ids.push(id);
}
}
document_ids.sort_unstable();
let setbuf = SetBuf::new_unchecked(document_ids);
DocIds::new(&setbuf)
};
// create the Index of all the document updates
let index = {
let mut builder = IndexBuilder::new();
for (key, mut indexes) in self.indexed_words {
indexes.sort_unstable();
let indexes = Set::new_unchecked(&indexes);
builder.insert(key, indexes).unwrap();
}
builder.build()
};
// WARN: removed documents must absolutely
// be merged *before* document updates
// === index ===
if !removed_documents.is_empty() {
// remove the documents using the appropriate IndexEvent
let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes();
self.batch.merge(DATA_INDEX, &event_bytes)?;
}
// update the documents using the appropriate IndexEvent
let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes();
self.batch.merge(DATA_INDEX, &event_bytes)?;
// === ranked map ===
if !removed_documents.is_empty() {
// update the ranked map using the appropriate RankedMapEvent
let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes();
self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
}
        // update the ranked map using the appropriate RankedMapEvent
let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes();
self.batch.merge(DATA_RANKED_MAP, &event_bytes)?;
Ok(self.batch)
}
}
pub struct DocumentUpdate<'a> {
document_id: DocumentId,
inner: &'a mut RawUpdateBuilder,
}
impl<'a> DocumentUpdate<'a> {
pub fn remove(&mut self) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) {
return Err(SerializerError::custom(
"This document has already been updated and cannot be removed in the same update"
));
}
let start = DocumentKey::new(self.document_id).with_attribute_min();
        let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME: delete_range's end is exclusive, should be max + 1
self.inner.batch.delete_range(start.as_ref(), end.as_ref())?;
Ok(())
}
pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
let key = DocumentKeyAttr::new(self.document_id, attr);
self.inner.batch.put(key.as_ref(), &value)?;
Ok(())
}
pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> {
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted and cannot be updated in the same update"
));
}
self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index);
Ok(())
}
pub fn register_ranked_attribute(
&mut self,
attr: SchemaAttr,
number: Number,
) -> Result<(), SerializerError>
{
use serde::ser::Error;
if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) {
return Err(SerializerError::custom(
"This document has already been deleted, ranked attributes cannot be added in the same update"
));
}
self.inner.documents_ranked_fields.insert((self.document_id, attr), number);
Ok(())
}
}
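
The `Updated`/`Deleted` bookkeeping above guarantees that a document cannot be both updated and removed inside a single batch. A crate-internal sketch of the rule (assuming a `schema`, `doc`, `tokenizer_builder`, and `stop_words` already in scope):

let mut update = Update::new(schema);
update.update_document(&doc, &tokenizer_builder, &stop_words)?;
// The same document cannot now be removed in this batch: its Updated
// entry makes the builder return a SerializerError.
assert!(update.remove_document(&doc).is_err());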

View File

@ -1,58 +0,0 @@
use std::error::Error;
use byteorder::{ReadBytesExt, WriteBytesExt};
use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
use crate::write_to_bytes::WriteToBytes;
use crate::database::RankedMap;
use crate::data::DocIds;
pub enum WriteRankedMapEvent<'a> {
RemovedDocuments(&'a DocIds),
UpdatedDocuments(&'a RankedMap),
}
impl<'a> WriteToBytes for WriteRankedMapEvent<'a> {
fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
match self {
WriteRankedMapEvent::RemovedDocuments(doc_ids) => {
let _ = bytes.write_u8(0);
doc_ids.write_to_bytes(bytes);
},
WriteRankedMapEvent::UpdatedDocuments(ranked_map) => {
let _ = bytes.write_u8(1);
bincode::serialize_into(bytes, ranked_map).unwrap()
}
}
}
}
pub enum ReadRankedMapEvent {
RemovedDocuments(DocIds),
UpdatedDocuments(RankedMap),
}
impl ReadRankedMapEvent {
pub fn updated_documents(self) -> Option<RankedMap> {
use ReadRankedMapEvent::*;
match self {
RemovedDocuments(_) => None,
UpdatedDocuments(ranked_map) => Some(ranked_map),
}
}
}
impl FromSharedDataCursor for ReadRankedMapEvent {
type Error = Box<Error>;
fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<Self, Self::Error> {
match cursor.read_u8()? {
0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments),
1 => {
let ranked_map = bincode::deserialize_from(cursor)?;
Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map))
},
_ => unreachable!(),
}
}
}

View File

@ -1,201 +0,0 @@
use std::error::Error;
use std::path::Path;
use std::ops::Deref;
use std::{fmt, marker};
use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions};
use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter};
use serde::de::DeserializeOwned;
use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config};
use crate::database::serde::deserializer::Deserializer;
use crate::database::{DocumentKey, DocumentKeyAttr};
use crate::rank::{QueryBuilder, FilterFunc};
use crate::database::schema::Schema;
use crate::database::index::Index;
use crate::database::RankedMap;
use crate::database::Config;
use crate::DocumentId;
pub struct DatabaseView<D>
where D: Deref<Target=DB>
{
snapshot: Snapshot<D>,
index: Index,
ranked_map: RankedMap,
schema: Schema,
config: Config,
}
impl<D> DatabaseView<D>
where D: Deref<Target=DB>
{
pub fn new(snapshot: Snapshot<D>) -> Result<DatabaseView<D>, Box<Error>> {
let schema = retrieve_data_schema(&snapshot)?;
let index = retrieve_data_index(&snapshot)?;
let ranked_map = retrieve_data_ranked_map(&snapshot)?;
let config = retrieve_config(&snapshot)?;
Ok(DatabaseView { snapshot, index, ranked_map, schema, config })
}
pub fn schema(&self) -> &Schema {
&self.schema
}
pub fn index(&self) -> &Index {
&self.index
}
pub fn ranked_map(&self) -> &RankedMap {
&self.ranked_map
}
pub fn into_snapshot(self) -> Snapshot<D> {
self.snapshot
}
pub fn snapshot(&self) -> &Snapshot<D> {
&self.snapshot
}
pub fn config(&self) -> &Config {
&self.config
}
pub fn get(&self, key: &[u8]) -> Result<Option<DBVector>, Box<Error>> {
Ok(self.snapshot.get(key)?)
}
pub fn dump_all<P: AsRef<Path>>(&self, path: P) -> Result<(), Box<Error>> {
let path = path.as_ref().to_string_lossy();
let env_options = EnvOptions::new();
let column_family_options = ColumnFamilyOptions::new();
let mut file_writer = SstFileWriter::new(env_options, column_family_options);
file_writer.open(&path)?;
let mut iter = self.snapshot.iter();
iter.seek(SeekKey::Start);
for (key, value) in &mut iter {
file_writer.put(&key, &value)?;
}
file_writer.finish()?;
Ok(())
}
pub fn query_builder(&self) -> QueryBuilder<FilterFunc> {
QueryBuilder::new(self.index())
}
pub fn raw_field_by_document_id(
&self,
name: &str,
id: DocumentId
) -> Result<Option<Vec<u8>>, Box<Error>>
{
let attr = self.schema.attribute(name).ok_or("field not found")?;
let key = DocumentKeyAttr::new(id, attr);
let vector = self.snapshot.get(key.as_ref())?;
Ok(vector.map(|v| v.to_vec()))
}
pub fn document_by_id<T>(&self, id: DocumentId) -> Result<T, Box<Error>>
where T: DeserializeOwned,
{
let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id);
Ok(T::deserialize(&mut deserializer)?)
}
pub fn documents_by_id<T, I>(&self, ids: I) -> DocumentIter<D, T, I::IntoIter>
where T: DeserializeOwned,
I: IntoIterator<Item=DocumentId>,
{
DocumentIter {
database_view: self,
document_ids: ids.into_iter(),
_phantom: marker::PhantomData,
}
}
}
impl<D> fmt::Debug for DatabaseView<D>
where D: Deref<Target=DB>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let mut options = ReadOptions::new();
let lower = DocumentKey::new(DocumentId(0));
options.set_iterate_lower_bound(lower.as_ref());
let mut iter = self.snapshot.iter_opt(options);
iter.seek(SeekKey::Start);
let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key));
if f.alternate() {
writeln!(f, "DatabaseView(")?;
} else {
write!(f, "DatabaseView(")?;
}
self.schema.fmt(f)?;
if f.alternate() {
writeln!(f, ",")?;
} else {
write!(f, ", ")?;
}
f.debug_list().entries(iter).finish()?;
write!(f, ")")
}
}
// TODO: this could simply be an iter::Map
pub struct DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>
{
database_view: &'a DatabaseView<D>,
document_ids: I,
_phantom: marker::PhantomData<T>,
}
impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: Iterator<Item=DocumentId>,
{
type Item = Result<T, Box<Error>>;
fn size_hint(&self) -> (usize, Option<usize>) {
self.document_ids.size_hint()
}
fn next(&mut self) -> Option<Self::Item> {
match self.document_ids.next() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: ExactSizeIterator + Iterator<Item=DocumentId>,
{ }
impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I>
where D: Deref<Target=DB>,
T: DeserializeOwned,
I: DoubleEndedIterator + Iterator<Item=DocumentId>,
{
fn next_back(&mut self) -> Option<Self::Item> {
match self.document_ids.next_back() {
Some(id) => Some(self.database_view.document_by_id(id)),
None => None
}
}
}
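
A usage sketch for `documents_by_id` (assuming a `view` and document ids from a committed update, and a deserializable `SimpleDoc` type as in the tests elsewhere in this diff):

for document in view.documents_by_id::<SimpleDoc, _>(vec![docid0, docid1]) {
    let document: SimpleDoc = document?; // each item is a Result
    println!("{:?}", document);
}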

View File

@ -1,136 +0,0 @@
#![cfg_attr(feature = "nightly", feature(test))]
pub mod automaton;
pub mod database;
pub mod data;
pub mod rank;
pub mod tokenizer;
mod common_words;
mod shared_data_cursor;
mod write_to_bytes;
use serde_derive::{Serialize, Deserialize};
pub use rocksdb;
pub use self::tokenizer::Tokenizer;
pub use self::common_words::CommonWords;
pub fn is_cjk(c: char) -> bool {
(c >= '\u{2e80}' && c <= '\u{2eff}') ||
(c >= '\u{2f00}' && c <= '\u{2fdf}') ||
(c >= '\u{3040}' && c <= '\u{309f}') ||
(c >= '\u{30a0}' && c <= '\u{30ff}') ||
(c >= '\u{3100}' && c <= '\u{312f}') ||
(c >= '\u{3200}' && c <= '\u{32ff}') ||
(c >= '\u{3400}' && c <= '\u{4dbf}') ||
(c >= '\u{4e00}' && c <= '\u{9fff}') ||
(c >= '\u{f900}' && c <= '\u{faff}')
}
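
`is_cjk` only checks the block ranges listed above; for instance:

assert!(is_cjk('中'));  // U+4E2D, CJK Unified Ideographs
assert!(is_cjk('カ'));  // U+30AB, Katakana
assert!(!is_cjk('a'));
assert!(!is_cjk('é'));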
/// Represents an internally generated unique document identifier.
///
/// It is used to tell the database which document you want to deserialize.
/// Helpful for custom ranking.
#[derive(Serialize, Deserialize)]
#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
pub struct DocumentId(u64);
/// This structure represents the position of a word
/// in a document and its attributes.
///
/// This is stored in the map, generated at index time,
/// extracted and interpreted at search time.
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct DocIndex {
/// The document identifier where the word was found.
pub document_id: DocumentId,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
    /// The position in characters where the word was found,
    /// along with its length.
    ///
    /// It locates the original word in the indexed text
    /// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
/// This structure represents a matching word with information
/// on the location of the word in the document.
///
/// The order of the fields is important because it defines
/// the way these structures are ordered between themselves.
///
/// The word in itself is not important.
// TODO: data-oriented programming? plain arrays?
#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Match {
/// The word index in the query sentence.
/// Same as the `attribute_index` but for the query words.
///
/// Used to retrieve the automaton that match this word.
pub query_index: u32,
/// The distance the word has with the query word
/// (i.e. the Levenshtein distance).
pub distance: u8,
/// The attribute in the document where the word was found
/// along with the index in it.
pub attribute: u16,
pub word_index: u16,
/// Whether the word that match is an exact match or a prefix.
pub is_exact: bool,
    /// The position in characters where the word was found,
    /// along with its length.
    ///
    /// It locates the original word in the indexed text
    /// without needing to run the tokenizer again.
pub char_index: u16,
pub char_length: u16,
}
impl Match {
pub fn zero() -> Self {
Match {
query_index: 0,
distance: 0,
attribute: 0,
word_index: 0,
is_exact: false,
char_index: 0,
char_length: 0,
}
}
pub fn max() -> Self {
Match {
query_index: u32::max_value(),
distance: u8::max_value(),
attribute: u16::max_value(),
word_index: u16::max_value(),
is_exact: true,
char_index: u16::max_value(),
char_length: u16::max_value(),
}
}
}
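
`Match::zero` and `Match::max` are the least and greatest values of the derived ordering, which makes them convenient as range bounds; for example:

let m = Match {
    query_index: 1,
    distance: 2,
    attribute: 0,
    word_index: 3,
    is_exact: false,
    char_index: 4,
    char_length: 5,
};
assert!(Match::zero() <= m && m <= Match::max());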
#[cfg(test)]
mod tests {
use super::*;
use std::mem;
#[test]
fn docindex_mem_size() {
assert_eq!(mem::size_of::<DocIndex>(), 16);
}
}

View File

@ -1,259 +0,0 @@
use std::mem;
use crate::is_cjk;
use self::Separator::*;
pub trait TokenizerBuilder {
    fn build<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item=Token<'a>> + 'a>;
}
pub struct DefaultBuilder;
impl DefaultBuilder {
pub fn new() -> DefaultBuilder {
DefaultBuilder
}
}
#[derive(Debug, PartialEq, Eq)]
pub struct Token<'a> {
pub word: &'a str,
pub word_index: usize,
pub char_index: usize,
}
impl TokenizerBuilder for DefaultBuilder {
    fn build<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item=Token<'a>> + 'a> {
Box::new(Tokenizer::new(text))
}
}
pub struct Tokenizer<'a> {
word_index: usize,
char_index: usize,
inner: &'a str,
}
impl<'a> Tokenizer<'a> {
pub fn new(string: &str) -> Tokenizer {
let mut char_advance = 0;
let mut index_advance = 0;
for (n, (i, c)) in string.char_indices().enumerate() {
char_advance = n;
index_advance = i;
if detect_separator(c).is_none() { break }
}
Tokenizer {
word_index: 0,
char_index: char_advance,
inner: &string[index_advance..],
}
}
}
#[derive(Debug, Clone, Copy)]
enum Separator {
Short,
Long,
}
impl Separator {
fn add(self, add: Separator) -> Separator {
match (self, add) {
(_, Long) => Long,
(Short, Short) => Short,
(Long, Short) => Long,
}
}
fn to_usize(self) -> usize {
match self {
Short => 1,
Long => 8,
}
}
}
fn detect_separator(c: char) -> Option<Separator> {
match c {
'.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Long),
' ' | '\'' | '"' => Some(Short),
_ => None,
}
}
impl<'a> Iterator for Tokenizer<'a> {
type Item = Token<'a>;
fn next(&mut self) -> Option<Self::Item> {
let mut start_word = None;
let mut distance = None;
for (i, c) in self.inner.char_indices() {
match detect_separator(c) {
Some(sep) => {
if let Some(start_word) = start_word {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.char_index += word.chars().count();
return Some(token)
}
distance = Some(distance.map_or(sep, |s| s.add(sep)));
},
None => {
                    // if this is a Chinese, Japanese or Korean character
// See <http://unicode-table.com>
if is_cjk(c) {
match start_word {
Some(start_word) => {
let (prefix, tail) = self.inner.split_at(i);
let (spaces, word) = prefix.split_at(start_word);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
self.word_index += 1;
self.char_index += word.chars().count();
return Some(token)
},
None => {
let (prefix, tail) = self.inner.split_at(i + c.len_utf8());
let (spaces, word) = prefix.split_at(i);
self.inner = tail;
self.char_index += spaces.chars().count();
self.word_index += distance.map(Separator::to_usize).unwrap_or(0);
let token = Token {
word: word,
word_index: self.word_index,
char_index: self.char_index,
};
if tail.chars().next().and_then(detect_separator).is_none() {
self.word_index += 1;
}
self.char_index += 1;
return Some(token)
}
}
}
if start_word.is_none() { start_word = Some(i) }
},
}
}
if let Some(start_word) = start_word {
let prefix = mem::replace(&mut self.inner, "");
let (spaces, word) = prefix.split_at(start_word);
let token = Token {
word: word,
word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0),
char_index: self.char_index + spaces.chars().count(),
};
return Some(token)
}
None
}
}
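
Word indexes therefore advance by 1 across a Short separator but by 8 across a Long one, which is exactly what the `hard` test below relies on; a short check:

let mut tokenizer = Tokenizer::new("yo ! lolo");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
// " ! " contains a Long separator, so the word index jumps by 8.
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));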
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn easy() {
let mut tokenizer = Tokenizer::new("salut");
assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard() {
let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_long_chars() {
let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
assert_eq!(tokenizer.next(), None);
}
#[test]
fn hard_kanjis() {
let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
assert_eq!(tokenizer.next(), None);
let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
assert_eq!(tokenizer.next(), None);
}
}