diff --git a/.gitignore b/.gitignore
index 5768350a8..c38aa51d3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,6 +1,7 @@
-/rocksdb
 /target
 /Cargo.lock
+meilidb/Cargo.lock
+meilidb-core/Cargo.lock
 **/*.rs.bk
 **/*.csv
 **/*.json_lines
diff --git a/Cargo.toml b/Cargo.toml
index dd994020d..69297052b 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,55 +1,10 @@
-[package]
-edition = "2018"
-name = "meilidb"
-version = "0.3.2"
-authors = ["Kerollmops "]
-
-[dependencies]
-arc-swap = "0.3.7"
-bincode = "1.1.2"
-byteorder = "1.3.1"
-fst = "0.3.3"
-hashbrown = { version = "0.1.8", features = ["serde"] }
-lazy_static = "1.2.0"
-levenshtein_automata = { version = "0.1.1", features = ["fst_automaton"] }
-linked-hash-map = { version = "0.5.1", features = ["serde_impl"] }
-lockfree = "0.5.1"
-log = "0.4.6"
-rayon = "1.0.3"
-sdset = "0.3.1"
-serde = "1.0.88"
-serde_derive = "1.0.88"
-serde_json = { version = "1.0.38", features = ["preserve_order"] }
-size_format = "1.0.2"
-slice-group-by = "0.2.4"
-unidecode = "0.3.0"
-
-[dependencies.toml]
-git = "https://github.com/Kerollmops/toml-rs.git"
-features = ["preserve_order"]
-rev = "0372ba6"
-
-[dependencies.rocksdb]
-git = "https://github.com/pingcap/rust-rocksdb.git"
-rev = "306e201"
-
-[features]
-default = ["simd"]
-i128 = ["bincode/i128", "byteorder/i128"]
-portable = ["rocksdb/portable"]
-simd = ["rocksdb/sse"]
-nightly = ["hashbrown/nightly", "slice-group-by/nightly"]
-
-[dev-dependencies]
-csv = "1.0.5"
-env_logger = "0.6.0"
-jemallocator = "0.1.9"
-quickcheck = "0.8.2"
-rand = "0.6.5"
-rand_xorshift = "0.1.1"
-structopt = "0.2.14"
-tempfile = "3.0.7"
-termcolor = "1.0.4"
+[workspace]
+members = [
+    "meilidb",
+    "meilidb-core",
+    "meilidb-data",
+    "meilidb-tokenizer",
+]
 
 [profile.release]
 debug = true
diff --git a/examples/ebay/schema-example.toml b/examples/ebay/schema-example.toml
deleted file mode 100644
index fcf2685e9..000000000
--- a/examples/ebay/schema-example.toml
+++ /dev/null
@@ -1,19 +0,0 @@
-# This schema has been generated ...
-# The order in which the attributes are declared is important,
-# it specify the attribute xxx...
- -identifier = "id" - -[attributes.id] -stored = true - -[attributes.title] -stored = true -indexed = true - -[attributes.description] -stored = true -indexed = true - -[attributes.image] -stored = true diff --git a/meilidb-core/Cargo.toml b/meilidb-core/Cargo.toml new file mode 100644 index 000000000..16bc204d4 --- /dev/null +++ b/meilidb-core/Cargo.toml @@ -0,0 +1,29 @@ +[package] +name = "meilidb-core" +version = "0.1.0" +authors = ["Kerollmops "] +edition = "2018" + +[dependencies] +byteorder = "1.3.1" +hashbrown = "0.2.2" +lazy_static = "1.2.0" +log = "0.4.6" +meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" } +rayon = "1.0.3" +sdset = "0.3.1" +serde = { version = "1.0.88", features = ["derive"] } +slice-group-by = "0.2.4" + +[dependencies.fst] +git = "https://github.com/Kerollmops/fst.git" +branch = "arc-byte-slice" + +[dependencies.levenshtein_automata] +git = "https://github.com/Kerollmops/levenshtein-automata.git" +branch = "arc-byte-slice" +features = ["fst_automaton"] + +[features] +i128 = ["byteorder/i128"] +nightly = ["hashbrown/nightly", "slice-group-by/nightly"] diff --git a/src/automaton.rs b/meilidb-core/src/automaton.rs similarity index 100% rename from src/automaton.rs rename to meilidb-core/src/automaton.rs diff --git a/src/rank/criterion/document_id.rs b/meilidb-core/src/criterion/document_id.rs similarity index 76% rename from src/rank/criterion/document_id.rs rename to meilidb-core/src/criterion/document_id.rs index 8e4cf91b5..27025a2da 100644 --- a/src/rank/criterion/document_id.rs +++ b/meilidb-core/src/criterion/document_id.rs @@ -1,7 +1,6 @@ use std::cmp::Ordering; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[derive(Debug, Clone, Copy)] pub struct DocumentId; diff --git a/src/rank/criterion/exact.rs b/meilidb-core/src/criterion/exact.rs similarity index 92% rename from src/rank/criterion/exact.rs rename to meilidb-core/src/criterion/exact.rs index 6933aaff5..b76e9ace5 100644 --- a/src/rank/criterion/exact.rs +++ b/meilidb-core/src/criterion/exact.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use slice_group_by::GroupBy; - -use crate::rank::criterion::Criterion; -use crate::rank::RawDocument; +use crate::criterion::Criterion; +use crate::RawDocument; #[inline] fn number_exact_matches(query_index: &[u32], is_exact: &[bool]) -> usize { diff --git a/src/rank/criterion/mod.rs b/meilidb-core/src/criterion/mod.rs similarity index 97% rename from src/rank/criterion/mod.rs rename to meilidb-core/src/criterion/mod.rs index 78c1bff5a..2ad3a183c 100644 --- a/src/rank/criterion/mod.rs +++ b/meilidb-core/src/criterion/mod.rs @@ -4,11 +4,10 @@ mod words_proximity; mod sum_of_words_attribute; mod sum_of_words_position; mod exact; -mod sort_by_attr; mod document_id; use std::cmp::Ordering; -use crate::rank::RawDocument; +use crate::RawDocument; pub use self::{ sum_of_typos::SumOfTypos, @@ -17,7 +16,6 @@ pub use self::{ sum_of_words_attribute::SumOfWordsAttribute, sum_of_words_position::SumOfWordsPosition, exact::Exact, - sort_by_attr::SortByAttr, document_id::DocumentId, }; diff --git a/src/rank/criterion/number_of_words.rs b/meilidb-core/src/criterion/number_of_words.rs similarity index 89% rename from src/rank/criterion/number_of_words.rs rename to meilidb-core/src/criterion/number_of_words.rs index 0c6f5a200..798123e6a 100644 --- a/src/rank/criterion/number_of_words.rs +++ b/meilidb-core/src/criterion/number_of_words.rs @@ -1,9 +1,7 @@ use std::cmp::Ordering; - use 
 use std::cmp::Ordering;
-
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn number_of_query_words(query_index: &[u32]) -> usize {
diff --git a/src/rank/criterion/sum_of_typos.rs b/meilidb-core/src/criterion/sum_of_typos.rs
similarity index 97%
rename from src/rank/criterion/sum_of_typos.rs
rename to meilidb-core/src/criterion/sum_of_typos.rs
index bbffec870..714766a20 100644
--- a/src/rank/criterion/sum_of_typos.rs
+++ b/meilidb-core/src/criterion/sum_of_typos.rs
@@ -2,8 +2,8 @@ use std::cmp::Ordering;
 
 use slice_group_by::GroupBy;
 
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 // This function is a wrong logarithmic 10 function.
 // It is safe to panic on input number higher than 3,
diff --git a/src/rank/criterion/sum_of_words_attribute.rs b/meilidb-core/src/criterion/sum_of_words_attribute.rs
similarity index 92%
rename from src/rank/criterion/sum_of_words_attribute.rs
rename to meilidb-core/src/criterion/sum_of_words_attribute.rs
index 0a5303490..a46787797 100644
--- a/src/rank/criterion/sum_of_words_attribute.rs
+++ b/meilidb-core/src/criterion/sum_of_words_attribute.rs
@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
-
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn sum_matches_attributes(query_index: &[u32], attribute: &[u16]) -> usize {
diff --git a/src/rank/criterion/sum_of_words_position.rs b/meilidb-core/src/criterion/sum_of_words_position.rs
similarity index 93%
rename from src/rank/criterion/sum_of_words_position.rs
rename to meilidb-core/src/criterion/sum_of_words_position.rs
index 5938ce5ab..86f4e93fa 100644
--- a/src/rank/criterion/sum_of_words_position.rs
+++ b/meilidb-core/src/criterion/sum_of_words_position.rs
@@ -1,9 +1,7 @@
 use std::cmp::Ordering;
-
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 #[inline]
 fn sum_matches_attribute_index(query_index: &[u32], word_index: &[u16]) -> usize {
diff --git a/src/rank/criterion/words_proximity.rs b/meilidb-core/src/criterion/words_proximity.rs
similarity index 98%
rename from src/rank/criterion/words_proximity.rs
rename to meilidb-core/src/criterion/words_proximity.rs
index dbf26e21a..fc6c8bb31 100644
--- a/src/rank/criterion/words_proximity.rs
+++ b/meilidb-core/src/criterion/words_proximity.rs
@@ -1,9 +1,7 @@
 use std::cmp::{self, Ordering};
-
 use slice_group_by::GroupBy;
-
-use crate::rank::criterion::Criterion;
-use crate::rank::RawDocument;
+use crate::criterion::Criterion;
+use crate::RawDocument;
 
 const MAX_DISTANCE: u16 = 8;
diff --git a/src/data/doc_ids.rs b/meilidb-core/src/data/doc_ids.rs
similarity index 100%
rename from src/data/doc_ids.rs
rename to meilidb-core/src/data/doc_ids.rs
diff --git a/src/data/doc_indexes.rs b/meilidb-core/src/data/doc_indexes.rs
similarity index 100%
rename from src/data/doc_indexes.rs
rename to meilidb-core/src/data/doc_indexes.rs
diff --git a/src/data/mod.rs b/meilidb-core/src/data/mod.rs
similarity index 100%
rename from src/data/mod.rs
rename to meilidb-core/src/data/mod.rs
diff --git a/src/data/shared_data.rs b/meilidb-core/src/data/shared_data.rs
similarity index 74%
rename from src/data/shared_data.rs
rename to meilidb-core/src/data/shared_data.rs
index 100f837f7..fd505c6d9 100644
--- a/src/data/shared_data.rs
+++ b/meilidb-core/src/data/shared_data.rs
@@ -1,9 +1,9 @@
 use std::sync::Arc;
 use std::ops::Deref;
 
-#[derive(Default, Clone)]
+#[derive(Clone)]
 pub struct SharedData {
-    pub bytes: Arc<Vec<u8>>,
+    pub bytes: Arc<[u8]>,
     pub offset: usize,
     pub len: usize,
 }
@@ -15,7 +15,7 @@ impl SharedData {
         SharedData::new(bytes, 0, len)
     }
 
-    pub fn new(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedData {
+    pub fn new(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedData {
         SharedData { bytes, offset, len }
     }
 
@@ -33,6 +33,16 @@ impl SharedData {
     }
 }
 
+impl Default for SharedData {
+    fn default() -> SharedData {
+        SharedData {
+            bytes: Arc::from(Vec::new()),
+            offset: 0,
+            len: 0,
+        }
+    }
+}
+
 impl Deref for SharedData {
     type Target = [u8];
 
diff --git a/src/rank/distinct_map.rs b/meilidb-core/src/distinct_map.rs
similarity index 100%
rename from src/rank/distinct_map.rs
rename to meilidb-core/src/distinct_map.rs
diff --git a/src/database/index.rs b/meilidb-core/src/index.rs
similarity index 100%
rename from src/database/index.rs
rename to meilidb-core/src/index.rs
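Reviewer note on the `Arc<Vec<u8>>` → `Arc<[u8]>` change above — a minimal, self-contained sketch (simplified from the patch, not part of it) of why a shared byte slice plus an `(offset, len)` pair gives cheap sub-slices: cloning only bumps a refcount, and `Arc<[u8]>` drops one level of pointer indirection compared to `Arc<Vec<u8>>`:

```rust
use std::sync::Arc;

struct SharedData {
    bytes: Arc<[u8]>,
    offset: usize,
    len: usize,
}

impl SharedData {
    // A sub-range that shares the same allocation.
    fn range(&self, offset: usize, len: usize) -> SharedData {
        assert!(offset + len <= self.len);
        SharedData {
            bytes: self.bytes.clone(), // refcount bump only, no copy
            offset: self.offset + offset,
            len,
        }
    }

    fn as_slice(&self) -> &[u8] {
        &self.bytes[self.offset..self.offset + self.len]
    }
}

fn main() {
    let data = SharedData { bytes: Arc::from(vec![1u8, 2, 3, 4]), offset: 0, len: 4 };
    let tail = data.range(2, 2);
    assert_eq!(tail.as_slice(), &[3, 4]);
}
```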
diff --git a/src/rank/mod.rs b/meilidb-core/src/lib.rs
similarity index 61%
rename from src/rank/mod.rs
rename to meilidb-core/src/lib.rs
index f5b07d27d..18e9a99cc 100644
--- a/src/rank/mod.rs
+++ b/meilidb-core/src/lib.rs
@@ -1,15 +1,117 @@
 pub mod criterion;
+pub mod data;
+mod index;
+mod automaton;
 mod query_builder;
 mod distinct_map;
+pub mod shared_data_cursor;
+pub mod write_to_bytes;
 
 use std::sync::Arc;
 
+use serde::{Serialize, Deserialize};
 use slice_group_by::GroupBy;
 use rayon::slice::ParallelSliceMut;
 
-use crate::{Match, DocumentId};
+pub use self::index::{Index, IndexBuilder};
+pub use self::query_builder::{QueryBuilder, DistinctQueryBuilder};
 
-pub use self::query_builder::{FilterFunc, QueryBuilder, DistinctQueryBuilder};
+/// Represents an internally generated, unique document identifier.
+///
+/// It is used to tell the database which document you want to deserialize.
+/// Helpful for custom ranking.
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)]
+pub struct DocumentId(pub u64);
+
+/// This structure represents the position of a word
+/// in a document along with its attribute.
+///
+/// This is stored in the map, generated at index time,
+/// then extracted and interpreted at search time.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[repr(C)]
+pub struct DocIndex {
+    /// The document identifier where the word was found.
+    pub document_id: DocumentId,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// The position in bytes where the word was found
+    /// along with its length.
+    ///
+    /// It gives the area of the original word in the indexed text,
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+/// This structure represents a matching word with information
+/// on its location in the document.
+///
+/// The order of the fields is important because it defines
+/// the way these structures are ordered between themselves.
+///
+/// The word in itself is not important.
+// TODO do data oriented programming ? very arrays ?
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct Match {
+    /// The word index in the query sentence.
+    /// Same as the `attribute_index` but for the query words.
+    ///
+    /// Used to retrieve the automaton that matches this word.
+    pub query_index: u32,
+
+    /// The distance the word has with the query word
+    /// (i.e. the Levenshtein distance).
+    pub distance: u8,
+
+    /// The attribute in the document where the word was found
+    /// along with the index in it.
+    pub attribute: u16,
+    pub word_index: u16,
+
+    /// Whether the matching word is an exact match or a prefix.
+    pub is_exact: bool,
+
+    /// The position in bytes where the word was found
+    /// along with its length.
+    ///
+    /// It gives the area of the original word in the indexed text,
+    /// without needing to run the tokenizer again.
+    pub char_index: u16,
+    pub char_length: u16,
+}
+
+impl Match {
+    pub fn zero() -> Self {
+        Match {
+            query_index: 0,
+            distance: 0,
+            attribute: 0,
+            word_index: 0,
+            is_exact: false,
+            char_index: 0,
+            char_length: 0,
+        }
+    }
+
+    pub fn max() -> Self {
+        Match {
+            query_index: u32::max_value(),
+            distance: u8::max_value(),
+            attribute: u16::max_value(),
+            word_index: u16::max_value(),
+            is_exact: true,
+            char_index: u16::max_value(),
+            char_length: u16::max_value(),
+        }
+    }
+}
 
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct Document {
@@ -181,3 +283,15 @@ impl Matches {
         }
     }
 }
+
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::mem;
+
+    #[test]
+    fn docindex_mem_size() {
+        assert_eq!(mem::size_of::<DocIndex>(), 16);
+    }
+}
diff --git a/src/rank/query_builder.rs b/meilidb-core/src/query_builder.rs
similarity index 84%
rename from src/rank/query_builder.rs
rename to meilidb-core/src/query_builder.rs
index 6b145b493..ad7de9c15 100644
--- a/src/rank/query_builder.rs
+++ b/meilidb-core/src/query_builder.rs
@@ -1,53 +1,27 @@
-use std::{cmp, mem};
-use std::ops::Range;
-use std::time::Instant;
 use std::hash::Hash;
+use std::ops::{Range, Deref};
 use std::rc::Rc;
+use std::time::Instant;
+use std::{cmp, mem};
 
 use rayon::slice::ParallelSliceMut;
-use slice_group_by::{GroupByMut, LinearStrGroupBy};
+use slice_group_by::GroupByMut;
+use meilidb_tokenizer::{is_cjk, split_query_string};
 use hashbrown::{HashMap, HashSet};
 use fst::Streamer;
 use log::info;
 
 use crate::automaton::{self, DfaExt, AutomatonExt};
-use crate::rank::distinct_map::{DistinctMap, BufferedDistinctMap};
-use crate::rank::criterion::Criteria;
-use crate::database::Index;
-use crate::rank::{raw_documents_from_matches, RawDocument, Document};
-use crate::{is_cjk, Match, DocumentId};
+use crate::distinct_map::{DistinctMap, BufferedDistinctMap};
+use crate::criterion::Criteria;
+use crate::{raw_documents_from_matches, RawDocument, Document};
+use crate::{Index, Match, DocumentId};
 
-#[derive(Debug, PartialEq, Eq)]
-enum CharCategory {
-    Space,
-    Cjk,
-    Other,
-}
-
-fn classify_char(c: char) -> CharCategory {
-    if c.is_whitespace() { CharCategory::Space }
-    else if is_cjk(c) { CharCategory::Cjk }
-    else { CharCategory::Other }
-}
-
-fn is_word(s: &&str) -> bool {
-    !s.chars().any(char::is_whitespace)
-}
-
-fn same_group_category(a: char, b: char) -> bool {
-    let ca = classify_char(a);
-    let cb = classify_char(b);
-    if ca == CharCategory::Cjk || cb == CharCategory::Cjk { false } else { ca == cb }
-}
-
-fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
+fn generate_automatons(query: &str) -> Vec<DfaExt> {
     let has_end_whitespace = query.chars().last().map_or(false, char::is_whitespace);
-    let mut groups = LinearStrGroupBy::new(query, same_group_category)
-        .filter(is_word)
-        .map(str::to_lowercase)
-        .peekable();
-
+    let mut groups = split_query_string(query).map(str::to_lowercase).peekable();
     let mut automatons = Vec::new();
+
     while let Some(word) = groups.next() {
         let has_following_word = groups.peek().is_some();
         let lev = if has_following_word || has_end_whitespace || word.chars().all(is_cjk) {
@@ -61,28 +35,26 @@ fn split_whitespace_automatons(query: &str) -> Vec<DfaExt> {
     automatons
 }
 
-pub type FilterFunc = fn(DocumentId) -> bool;
-
-pub struct QueryBuilder<'i, 'c, FI> {
-    index: &'i Index,
+pub struct QueryBuilder<'c, I, FI = fn(DocumentId) -> bool> {
+    index: I,
     criteria: Criteria<'c>,
     searchable_attrs: Option<HashSet<u16>>,
     filter: Option<FI>,
 }
 
-impl<'i, 'c> QueryBuilder<'i, 'c, FilterFunc> {
-    pub fn new(index: &'i Index) -> Self {
+impl<'c, I> QueryBuilder<'c, I, fn(DocumentId) -> bool> {
+    pub fn new(index: I) -> Self {
         QueryBuilder::with_criteria(index, Criteria::default())
     }
 
-    pub fn with_criteria(index: &'i Index, criteria: Criteria<'c>) -> Self {
+    pub fn with_criteria(index: I, criteria: Criteria<'c>) -> Self {
         QueryBuilder { index, criteria, searchable_attrs: None, filter: None }
     }
 }
 
-impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
 {
-    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'i, 'c, F>
+    pub fn with_filter<F>(self, function: F) -> QueryBuilder<'c, I, F>
     where F: Fn(DocumentId) -> bool,
     {
         QueryBuilder {
@@ -93,7 +65,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
         }
     }
 
-    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'i, 'c, FI, F>
+    pub fn with_distinct<F, K>(self, function: F, size: usize) -> DistinctQueryBuilder<'c, I, FI, F>
     where F: Fn(DocumentId) -> Option<K>,
           K: Hash + Eq,
     {
@@ -108,9 +80,13 @@
         let attributes = self.searchable_attrs.get_or_insert_with(HashSet::new);
         attributes.insert(attribute);
     }
+}
 
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index>,
+{
     fn query_all(&self, query: &str) -> Vec<RawDocument> {
-        let automatons = split_whitespace_automatons(query);
+        let automatons = generate_automatons(query);
 
         let mut stream = {
             let mut op_builder = fst::map::OpBuilder::new();
@@ -118,7 +94,7 @@ impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
                 let stream = self.index.map.search(automaton);
                 op_builder.push(stream);
             }
-            op_builder.union()
+            op_builder.r#union()
         };
 
         let mut matches = Vec::new();
@@ -159,8 +135,9 @@
     }
 }
 
-impl<'i, 'c, FI> QueryBuilder<'i, 'c, FI>
-where FI: Fn(DocumentId) -> bool,
+impl<'c, I, FI> QueryBuilder<'c, I, FI>
+where I: Deref<Target=Index>,
+      FI: Fn(DocumentId) -> bool,
 {
     pub fn query(self, query: &str, range: Range<usize>) -> Vec<Document> {
         // We delegate the filter work to the distinct query builder,
@@ -212,15 +189,15 @@
     }
 }
 
-pub struct DistinctQueryBuilder<'i, 'c, FI, FD> {
-    inner: QueryBuilder<'i, 'c, FI>,
+pub struct DistinctQueryBuilder<'c, I, FI, FD> {
+    inner: QueryBuilder<'c, I, FI>,
     function: FD,
     size: usize,
 }
 
-impl<'i, 'c, FI, FD> DistinctQueryBuilder<'i, 'c, FI, FD>
+impl<'c, I, FI, FD> DistinctQueryBuilder<'c, I, FI, FD>
 {
-    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'i, 'c, F, FD>
+    pub fn with_filter<F>(self, function: F) -> DistinctQueryBuilder<'c, I, F, FD>
     where F: Fn(DocumentId) -> bool,
     {
         DistinctQueryBuilder {
@@ -235,8 +212,9 @@
     }
 }
 
-impl<'i, 'c, FI, FD, K> DistinctQueryBuilder<'i, 'c, FI, FD>
-where FI: Fn(DocumentId) -> bool,
+impl<'c, I, FI, FD, K> DistinctQueryBuilder<'c, I, FI, FD>
+where I: Deref<Target=Index>,
+      FI: Fn(DocumentId) -> bool,
       FD: Fn(DocumentId) -> Option<K>,
       K: Hash + Eq,
 {
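A sketch of the `I: Deref<Target = Index>` idea introduced above, with a hypothetical stand-in `Index` type: the builder no longer borrows `&'i Index` but owns any handle that dereferences to one, so plain references and `Arc`-style guards (like the `Lease<Arc<WordIndex>>` used later by meilidb-data) fit the same API:

```rust
use std::ops::Deref;
use std::sync::Arc;

struct Index; // stand-in for meilidb_core::Index

struct QueryBuilder<I> { index: I }

impl<I> QueryBuilder<I>
where I: Deref<Target = Index>,
{
    fn new(index: I) -> Self { QueryBuilder { index } }

    // deref coercion turns &I into &Index here
    fn index(&self) -> &Index { &self.index }
}

fn main() {
    let owned = Arc::new(Index);
    let _by_arc = QueryBuilder::new(owned.clone()); // Arc<Index> derefs to Index
    let _by_ref = QueryBuilder::new(&*owned);       // &Index works just as well
}
```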
rename from src/shared_data_cursor.rs
rename to meilidb-core/src/shared_data_cursor.rs
index 00d36884a..9eeac472f 100644
--- a/src/shared_data_cursor.rs
+++ b/meilidb-core/src/shared_data_cursor.rs
@@ -7,12 +7,12 @@ pub struct SharedDataCursor(Cursor<SharedData>);
 impl SharedDataCursor {
     pub fn from_bytes(bytes: Vec<u8>) -> SharedDataCursor {
         let len = bytes.len();
-        let bytes = Arc::new(bytes);
+        let bytes = Arc::from(bytes);
 
         SharedDataCursor::from_shared_bytes(bytes, 0, len)
     }
 
-    pub fn from_shared_bytes(bytes: Arc<Vec<u8>>, offset: usize, len: usize) -> SharedDataCursor {
+    pub fn from_shared_bytes(bytes: Arc<[u8]>, offset: usize, len: usize) -> SharedDataCursor {
         let data = SharedData::new(bytes, offset, len);
         let cursor = Cursor::new(data);
 
diff --git a/src/write_to_bytes.rs b/meilidb-core/src/write_to_bytes.rs
similarity index 100%
rename from src/write_to_bytes.rs
rename to meilidb-core/src/write_to_bytes.rs
diff --git a/meilidb-data/Cargo.toml b/meilidb-data/Cargo.toml
new file mode 100644
index 000000000..6096e4ad3
--- /dev/null
+++ b/meilidb-data/Cargo.toml
@@ -0,0 +1,25 @@
+[package]
+name = "meilidb-data"
+version = "0.1.0"
+authors = ["Kerollmops "]
+edition = "2018"
+
+[dependencies]
+arc-swap = "0.3.11"
+bincode = "1.1.2"
+byteorder = "1.3.1"
+hashbrown = { version = "0.2.2", features = ["serde"] }
+linked-hash-map = { version = "0.5.2", features = ["serde_impl"] }
+meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
+ordered-float = { version = "1.0.2", features = ["serde"] }
+sdset = "0.3.1"
+serde = { version = "1.0.90", features = ["derive"] }
+serde_json = { version = "1.0.39", features = ["preserve_order"] }
+sled = "0.23.0"
+toml = { version = "0.5.0", features = ["preserve_order"] }
+deunicode = "1.0.0"
+
+[dependencies.rmp-serde]
+git = "https://github.com/3Hren/msgpack-rust.git"
+rev = "40b3d48"
diff --git a/meilidb-data/src/database.rs b/meilidb-data/src/database.rs
new file mode 100644
index 000000000..da43dd4bc
--- /dev/null
+++ b/meilidb-data/src/database.rs
@@ -0,0 +1,464 @@
+use std::collections::HashSet;
+use std::io::{self, Cursor, BufRead};
+use std::iter::FromIterator;
+use std::path::Path;
+use std::sync::Arc;
+use std::{error, fmt};
+
+use arc_swap::{ArcSwap, Lease};
+use byteorder::{ReadBytesExt, BigEndian};
+use hashbrown::HashMap;
+use meilidb_core::criterion::Criteria;
+use meilidb_core::QueryBuilder;
+use meilidb_core::shared_data_cursor::{FromSharedDataCursor, SharedDataCursor};
+use meilidb_core::write_to_bytes::WriteToBytes;
+use meilidb_core::{DocumentId, Index as WordIndex};
+use rmp_serde::decode::{Error as RmpError};
+use sdset::SetBuf;
+use serde::de;
+use sled::IVec;
+
+use crate::{Schema, SchemaAttr, RankedMap};
+use crate::serde::{extract_document_id, Serializer, Deserializer, SerializerError};
+use crate::indexer::Indexer;
+
+#[derive(Debug)]
+pub enum Error {
+    SchemaDiffer,
+    SchemaMissing,
+    WordIndexMissing,
+    MissingDocumentId,
+    SledError(sled::Error),
+    BincodeError(bincode::Error),
+    SerializerError(SerializerError),
+}
+
+impl From<sled::Error> for Error {
+    fn from(error: sled::Error) -> Error {
+        Error::SledError(error)
+    }
+}
+
+impl From<bincode::Error> for Error {
+    fn from(error: bincode::Error) -> Error {
+        Error::BincodeError(error)
+    }
+}
+
+impl From<SerializerError> for Error {
+    fn from(error: SerializerError) -> Error {
+        Error::SerializerError(error)
+    }
+}
+
+impl fmt::Display for Error {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        use self::Error::*;
+        match self {
write!(f, "schemas differ"), + SchemaMissing => write!(f, "this index does not have a schema"), + WordIndexMissing => write!(f, "this index does not have a word index"), + MissingDocumentId => write!(f, "document id is missing"), + SledError(e) => write!(f, "sled error; {}", e), + BincodeError(e) => write!(f, "bincode error; {}", e), + SerializerError(e) => write!(f, "serializer error; {}", e), + } + } +} + +impl error::Error for Error { } + +fn index_name(name: &str) -> Vec { + format!("index-{}", name).into_bytes() +} + +fn document_key(id: DocumentId, attr: SchemaAttr) -> Vec { + let DocumentId(document_id) = id; + let SchemaAttr(schema_attr) = attr; + + let mut bytes = Vec::new(); + bytes.extend_from_slice(b"document-"); + bytes.extend_from_slice(&document_id.to_be_bytes()[..]); + bytes.extend_from_slice(&schema_attr.to_be_bytes()[..]); + bytes +} + +trait CursorExt { + fn consume_if_eq(&mut self, needle: &[u8]) -> bool; +} + +impl> CursorExt for Cursor { + fn consume_if_eq(&mut self, needle: &[u8]) -> bool { + let position = self.position() as usize; + let slice = self.get_ref().as_ref(); + + if slice[position..].starts_with(needle) { + self.consume(needle.len()); + true + } else { + false + } + } +} + +fn extract_document_key(key: Vec) -> io::Result<(DocumentId, SchemaAttr)> { + let mut key = Cursor::new(key); + + if !key.consume_if_eq(b"document-") { + return Err(io::Error::from(io::ErrorKind::InvalidData)) + } + + let document_id = key.read_u64::().map(DocumentId)?; + let schema_attr = key.read_u16::().map(SchemaAttr)?; + + Ok((document_id, schema_attr)) +} + +#[derive(Clone)] +pub struct Database { + opened: Arc>>, + inner: sled::Db, +} + +impl Database { + pub fn start_default>(path: P) -> Result { + let inner = sled::Db::start_default(path)?; + let opened = Arc::new(ArcSwap::new(Arc::new(HashMap::new()))); + Ok(Database { opened, inner }) + } + + pub fn open_index(&self, name: &str) -> Result, Error> { + // check if the index was already opened + if let Some(raw_index) = self.opened.lease().get(name) { + return Ok(Some(Index(raw_index.clone()))) + } + + let raw_name = index_name(name); + if self.inner.tree_names().into_iter().any(|tn| tn == raw_name) { + let tree = self.inner.open_tree(raw_name)?; + let raw_index = RawIndex::from_raw(tree)?; + + self.opened.rcu(|opened| { + let mut opened = HashMap::clone(opened); + opened.insert(name.to_string(), raw_index.clone()); + opened + }); + + return Ok(Some(Index(raw_index))) + } + + Ok(None) + } + + pub fn create_index(&self, name: String, schema: Schema) -> Result { + match self.open_index(&name)? { + Some(index) => { + if index.schema() != &schema { + return Err(Error::SchemaDiffer); + } + + Ok(index) + }, + None => { + let raw_name = index_name(&name); + let tree = self.inner.open_tree(raw_name)?; + let raw_index = RawIndex::new_from_raw(tree, schema)?; + + self.opened.rcu(|opened| { + let mut opened = HashMap::clone(opened); + opened.insert(name.clone(), raw_index.clone()); + opened + }); + + Ok(Index(raw_index)) + }, + } + } +} + +#[derive(Clone)] +pub struct RawIndex { + schema: Schema, + word_index: Arc>, + ranked_map: Arc>, + inner: Arc, +} + +impl RawIndex { + fn from_raw(inner: Arc) -> Result { + let schema = { + let bytes = inner.get("schema")?; + let bytes = bytes.ok_or(Error::SchemaMissing)?; + Schema::read_from_bin(bytes.as_ref())? 
+        };
+
+        let bytes = inner.get("word-index")?;
+        let bytes = bytes.ok_or(Error::WordIndexMissing)?;
+        let word_index = {
+            let len = bytes.len();
+            let bytes: Arc<[u8]> = Into::into(bytes);
+            let mut cursor = SharedDataCursor::from_shared_bytes(bytes, 0, len);
+
+            // TODO must handle this error
+            let word_index = WordIndex::from_shared_data_cursor(&mut cursor).unwrap();
+
+            Arc::new(ArcSwap::new(Arc::new(word_index)))
+        };
+
+        let ranked_map = {
+            let map = match inner.get("ranked-map")? {
+                Some(bytes) => bincode::deserialize(bytes.as_ref())?,
+                None => RankedMap::default(),
+            };
+
+            Arc::new(ArcSwap::new(Arc::new(map)))
+        };
+
+        Ok(RawIndex { schema, word_index, ranked_map, inner })
+    }
+
+    fn new_from_raw(inner: Arc<sled::Tree>, schema: Schema) -> Result<RawIndex, Error> {
+        let mut schema_bytes = Vec::new();
+        schema.write_to_bin(&mut schema_bytes)?;
+        inner.set("schema", schema_bytes)?;
+
+        let word_index = WordIndex::default();
+        inner.set("word-index", word_index.into_bytes())?;
+        let word_index = Arc::new(ArcSwap::new(Arc::new(word_index)));
+
+        let ranked_map = Arc::new(ArcSwap::new(Arc::new(RankedMap::default())));
+
+        Ok(RawIndex { schema, word_index, ranked_map, inner })
+    }
+
+    pub fn schema(&self) -> &Schema {
+        &self.schema
+    }
+
+    pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
+        self.word_index.lease()
+    }
+
+    pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
+        self.ranked_map.lease()
+    }
+
+    pub fn update_word_index(&self, word_index: Arc<WordIndex>) -> sled::Result<()> {
+        let data = word_index.into_bytes();
+        self.inner.set("word-index", data).map(drop)?;
+        self.word_index.store(word_index);
+
+        Ok(())
+    }
+
+    pub fn update_ranked_map(&self, ranked_map: Arc<RankedMap>) -> sled::Result<()> {
+        let data = bincode::serialize(ranked_map.as_ref()).unwrap();
+        self.inner.set("ranked-map", data).map(drop)?;
+        self.ranked_map.store(ranked_map);
+
+        Ok(())
+    }
+
+    pub fn set_document_attribute<V>(
+        &self,
+        id: DocumentId,
+        attr: SchemaAttr,
+        value: V,
+    ) -> Result<Option<IVec>, sled::Error>
+    where IVec: From<V>,
+    {
+        let key = document_key(id, attr);
+        Ok(self.inner.set(key, value)?)
+    }
+
+    pub fn get_document_attribute(
+        &self,
+        id: DocumentId,
+        attr: SchemaAttr
+    ) -> Result<Option<IVec>, sled::Error>
+    {
+        let key = document_key(id, attr);
+        Ok(self.inner.get(key)?)
+    }
+
+    pub fn get_document_fields(&self, id: DocumentId) -> DocumentFieldsIter {
+        let start = document_key(id, SchemaAttr::min());
+        let end = document_key(id, SchemaAttr::max());
+        DocumentFieldsIter(self.inner.range(start..=end))
+    }
+
+    pub fn del_document_attribute(
+        &self,
+        id: DocumentId,
+        attr: SchemaAttr
+    ) -> Result<Option<IVec>, sled::Error>
+    {
+        let key = document_key(id, attr);
+        Ok(self.inner.del(key)?)
+    }
+}
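A quick illustration of why `document_key` above writes the id and attribute big-endian (assumption: sled iterates keys in lexicographic byte order): the byte order of the keys then matches the numeric order of `(document_id, schema_attr)`, which is what makes the `start..=end` range scan in `get_document_fields` visit exactly one document's attributes, in order:

```rust
// Standalone sketch of the key layout, not the patch's exact code.
fn document_key(document_id: u64, schema_attr: u16) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(9 + 8 + 2);
    bytes.extend_from_slice(b"document-");
    bytes.extend_from_slice(&document_id.to_be_bytes());
    bytes.extend_from_slice(&schema_attr.to_be_bytes());
    bytes
}

fn main() {
    // attribute 0 of document 1 sorts before attribute 1 of document 1,
    // and every attribute of document 1 sorts before document 2
    assert!(document_key(1, 0) < document_key(1, 1));
    assert!(document_key(1, u16::max_value()) < document_key(2, 0));
}
```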
+
+pub struct DocumentFieldsIter<'a>(sled::Iter<'a>);
+
+impl<'a> Iterator for DocumentFieldsIter<'a> {
+    type Item = Result<(DocumentId, SchemaAttr, IVec), Error>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match self.0.next() {
+            Some(Ok((key, value))) => {
+                let (id, attr) = extract_document_key(key).unwrap();
+                Some(Ok((id, attr, value)))
+            },
+            Some(Err(e)) => Some(Err(Error::SledError(e))),
+            None => None,
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct Index(RawIndex);
+
+impl Index {
+    pub fn query_builder(&self) -> QueryBuilder<Lease<Arc<WordIndex>>> {
+        let word_index = self.word_index();
+        QueryBuilder::new(word_index)
+    }
+
+    pub fn query_builder_with_criteria<'c>(
+        &self,
+        criteria: Criteria<'c>,
+    ) -> QueryBuilder<'c, Lease<Arc<WordIndex>>>
+    {
+        let word_index = self.word_index();
+        QueryBuilder::with_criteria(word_index, criteria)
+    }
+
+    pub fn schema(&self) -> &Schema {
+        self.0.schema()
+    }
+
+    pub fn word_index(&self) -> Lease<Arc<WordIndex>> {
+        self.0.word_index()
+    }
+
+    pub fn ranked_map(&self) -> Lease<Arc<RankedMap>> {
+        self.0.ranked_map()
+    }
+
+    pub fn documents_addition(&self) -> DocumentsAddition {
+        let index = self.0.clone();
+        let ranked_map = self.0.ranked_map().clone();
+        DocumentsAddition::from_raw(index, ranked_map)
+    }
+
+    pub fn documents_deletion(&self) -> DocumentsDeletion {
+        let index = self.0.clone();
+        DocumentsDeletion::from_raw(index)
+    }
+
+    pub fn document<T>(
+        &self,
+        fields: Option<&HashSet<&str>>,
+        id: DocumentId,
+    ) -> Result<Option<T>, RmpError>
+    where T: de::DeserializeOwned,
+    {
+        let fields = match fields {
+            Some(fields) => {
+                let iter = fields.iter().filter_map(|n| self.0.schema().attribute(n));
+                Some(HashSet::from_iter(iter))
+            },
+            None => None,
+        };
+
+        let mut deserializer = Deserializer {
+            document_id: id,
+            raw_index: &self.0,
+            fields: fields.as_ref(),
+        };
+
+        // TODO: currently we return an error if all document fields are missing,
+        //       returning None would have been better
+        T::deserialize(&mut deserializer).map(Some)
+    }
+}
+
+pub struct DocumentsAddition {
+    inner: RawIndex,
+    indexer: Indexer,
+    ranked_map: RankedMap,
+}
+
+impl DocumentsAddition {
+    pub fn from_raw(inner: RawIndex, ranked_map: RankedMap) -> DocumentsAddition {
+        DocumentsAddition { inner, indexer: Indexer::new(), ranked_map }
+    }
+
+    pub fn update_document<D>(&mut self, document: D) -> Result<(), Error>
+    where D: serde::Serialize,
+    {
+        let schema = self.inner.schema();
+        let identifier = schema.identifier_name();
+
+        let document_id = match extract_document_id(identifier, &document)? {
+            Some(id) => id,
+            None => return Err(Error::MissingDocumentId),
+        };
+
+        let serializer = Serializer {
+            schema,
+            index: &self.inner,
+            indexer: &mut self.indexer,
+            ranked_map: &mut self.ranked_map,
+            document_id,
+        };
+
+        document.serialize(serializer)?;
+
+        Ok(())
+    }
+
+    pub fn finalize(self) -> sled::Result<()> {
+        let delta_index = self.indexer.build();
+
+        let index = self.inner.word_index();
+        let new_index = index.r#union(&delta_index);
+
+        let new_index = Arc::from(new_index);
+        self.inner.update_word_index(new_index)?;
+
+        Ok(())
+    }
+}
+
+pub struct DocumentsDeletion {
+    inner: RawIndex,
+    documents: Vec<DocumentId>,
+}
+
+impl DocumentsDeletion {
+    pub fn from_raw(inner: RawIndex) -> DocumentsDeletion {
+        DocumentsDeletion {
+            inner,
+            documents: Vec::new(),
+        }
+    }
+
+    pub fn delete_document(&mut self, id: DocumentId) {
+        self.documents.push(id);
+    }
+
+    pub fn finalize(mut self) -> Result<(), Error> {
+        self.documents.sort_unstable();
+        self.documents.dedup();
+
+        let idset = SetBuf::new_unchecked(self.documents);
+        let index = self.inner.word_index();
+
+        let new_index = index.remove_documents(&idset);
+        let new_index = Arc::from(new_index);
+
+        self.inner.update_word_index(new_index)?;
+
+        Ok(())
+    }
+}
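Hypothetical usage of the `DocumentsAddition` API above (the `Document` struct is an assumption for illustration, and the schema's identifier attribute is assumed to be "id"):

```rust
#[derive(serde::Serialize)]
struct Document { id: u64, title: String }

fn add_documents(index: &meilidb_data::Index) -> Result<(), Box<dyn std::error::Error>> {
    let mut addition = index.documents_addition();
    addition.update_document(Document { id: 1, title: "hello".into() })?;
    addition.update_document(Document { id: 2, title: "world".into() })?;
    addition.finalize()?; // unions the delta word index into the stored one
    Ok(())
}
```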
diff --git a/meilidb-data/src/index_event.rs b/meilidb-data/src/index_event.rs
new file mode 100644
index 000000000..40d54cbf3
--- /dev/null
+++ b/meilidb-data/src/index_event.rs
@@ -0,0 +1,45 @@
+use std::error::Error;
+
+use byteorder::{ReadBytesExt, WriteBytesExt};
+
+use meilidb_core::{Index as WordIndex};
+use meilidb_core::data::DocIds;
+use meilidb_core::write_to_bytes::WriteToBytes;
+use meilidb_core::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor};
+
+enum NewIndexEvent<'a> {
+    RemovedDocuments(&'a DocIds),
+    UpdatedDocuments(&'a WordIndex),
+}
+
+impl<'a> WriteToBytes for NewIndexEvent<'a> {
+    fn write_to_bytes(&self, bytes: &mut Vec<u8>) {
+        match self {
+            NewIndexEvent::RemovedDocuments(doc_ids) => {
+                let _ = bytes.write_u8(0);
+                doc_ids.write_to_bytes(bytes);
+            },
+            NewIndexEvent::UpdatedDocuments(index) => {
+                let _ = bytes.write_u8(1);
+                index.write_to_bytes(bytes);
+            }
+        }
+    }
+}
+
+enum IndexEvent {
+    RemovedDocuments(DocIds),
+    UpdatedDocuments(WordIndex),
+}
+
+impl FromSharedDataCursor for IndexEvent {
+    type Error = Box<Error>;
+
+    fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result<IndexEvent, Self::Error> {
+        match cursor.read_u8()? {
+            0 => DocIds::from_shared_data_cursor(cursor).map(IndexEvent::RemovedDocuments),
+            1 => WordIndex::from_shared_data_cursor(cursor).map(IndexEvent::UpdatedDocuments),
+            _ => Err("invalid index event type".into()),
+        }
+    }
+}
diff --git a/meilidb-data/src/indexer.rs b/meilidb-data/src/indexer.rs
new file mode 100644
index 000000000..3cfd8f722
--- /dev/null
+++ b/meilidb-data/src/indexer.rs
@@ -0,0 +1,117 @@
+use std::collections::BTreeMap;
+use std::convert::TryFrom;
+
+use deunicode::deunicode_with_tofu;
+use meilidb_core::{DocumentId, DocIndex};
+use meilidb_core::{Index as WordIndex, IndexBuilder as WordIndexBuilder};
+use meilidb_tokenizer::{is_cjk, Tokenizer, SeqTokenizer, Token};
+use sdset::Set;
+
+use crate::SchemaAttr;
+
+type Word = Vec<u8>; // TODO make it be a SmallVec
+
+pub struct Indexer {
+    word_limit: usize, // the maximum number of indexed words
+    indexed: BTreeMap<Word, Vec<DocIndex>>,
+}
+
+impl Indexer {
+    pub fn new() -> Indexer {
+        Indexer {
+            word_limit: 1000,
+            indexed: BTreeMap::new(),
+        }
+    }
+
+    pub fn with_word_limit(limit: usize) -> Indexer {
+        Indexer {
+            word_limit: limit,
+            indexed: BTreeMap::new(),
+        }
+    }
+
+    pub fn index_text(&mut self, id: DocumentId, attr: SchemaAttr, text: &str) {
+        for token in Tokenizer::new(text) {
+            let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
+            if !must_continue { break }
+        }
+    }
+
+    pub fn index_text_seq<'a, I>(&mut self, id: DocumentId, attr: SchemaAttr, iter: I)
+    where I: IntoIterator<Item=&'a str>,
+    {
+        let iter = iter.into_iter();
+        for token in SeqTokenizer::new(iter) {
+            let must_continue = index_token(token, id, attr, self.word_limit, &mut self.indexed);
+            if !must_continue { break }
+        }
+    }
+
+    pub fn build(self) -> WordIndex {
+        let mut builder = WordIndexBuilder::new();
+
+        for (key, mut indexes) in self.indexed {
+            indexes.sort_unstable();
+            indexes.dedup();
+
+            let indexes = Set::new_unchecked(&indexes);
+            builder.insert(key, indexes).unwrap();
+        }
+
+        builder.build()
+    }
+}
+
+fn index_token(
+    token: Token,
+    id: DocumentId,
+    attr: SchemaAttr,
+    word_limit: usize,
+    indexed: &mut BTreeMap<Word, Vec<DocIndex>>,
+) -> bool
+{
+    if token.word_index >= word_limit { return false }
+
+    let lower = token.word.to_lowercase();
+    let token = Token { word: &lower, ..token };
+    match token_to_docindex(id, attr, token) {
+        Some(docindex) => {
+            let word = Vec::from(token.word);
+            indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+        },
+        None => return false,
+    }
+
+    if !lower.contains(is_cjk) {
+        let unidecoded = deunicode_with_tofu(&lower, "");
+        if unidecoded != lower {
+            let token = Token { word: &unidecoded, ..token };
+            match token_to_docindex(id, attr, token) {
+                Some(docindex) => {
+                    let word = Vec::from(token.word);
+                    indexed.entry(word).or_insert_with(Vec::new).push(docindex);
+                },
+                None => return false,
+            }
+        }
+    }
+
+    true
+}
+
+fn token_to_docindex(id: DocumentId, attr: SchemaAttr, token: Token) -> Option<DocIndex> {
+    let word_index = u16::try_from(token.word_index).ok()?;
+    let char_index = u16::try_from(token.char_index).ok()?;
+    let char_length = u16::try_from(token.word.chars().count()).ok()?;
+
+    let docindex = DocIndex {
+        document_id: id,
+        attribute: attr.0,
+        word_index: word_index,
+        char_index: char_index,
+        char_length: char_length,
+    };
+
+    Some(docindex)
+}
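The double insertion in `index_token` above means a word is indexed both as typed (lowercased) and, when it differs, in its deunicoded form, so an ASCII query can still match an accented word. A standalone sketch of that behavior (`deunicode` is the same crate the patch depends on):

```rust
use deunicode::deunicode_with_tofu;

// Returns every form under which a word would be indexed.
fn indexed_forms(word: &str) -> Vec<String> {
    let lower = word.to_lowercase();
    let mut forms = vec![lower.clone()];
    let unidecoded = deunicode_with_tofu(&lower, "");
    if unidecoded != lower {
        forms.push(unidecoded); // e.g. a query for "beyonce" can match "Beyoncé"
    }
    forms
}

fn main() {
    assert_eq!(indexed_forms("Beyoncé"), vec!["beyoncé", "beyonce"]);
}
```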
diff --git a/meilidb-data/src/lib.rs b/meilidb-data/src/lib.rs
new file mode 100644
index 000000000..542741171
--- /dev/null
+++ b/meilidb-data/src/lib.rs
@@ -0,0 +1,12 @@
+mod database;
+mod index_event;
+mod indexer;
+mod number;
+mod ranked_map;
+mod serde;
+pub mod schema;
+
+pub use self::database::{Database, Index};
+pub use self::number::Number;
+pub use self::ranked_map::RankedMap;
+pub use self::schema::{Schema, SchemaAttr};
diff --git a/meilidb-data/src/number.rs b/meilidb-data/src/number.rs
new file mode 100644
index 000000000..5e64cc78f
--- /dev/null
+++ b/meilidb-data/src/number.rs
@@ -0,0 +1,55 @@
+use std::num::{ParseIntError, ParseFloatError};
+use std::str::FromStr;
+use std::fmt;
+
+use ordered_float::OrderedFloat;
+use serde::{Serialize, Deserialize};
+
+#[derive(Serialize, Deserialize)]
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub enum Number {
+    Unsigned(u64),
+    Signed(i64),
+    Float(OrderedFloat<f64>),
+}
+
+impl FromStr for Number {
+    type Err = ParseNumberError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let uint_error = match u64::from_str(s) {
+            Ok(unsigned) => return Ok(Number::Unsigned(unsigned)),
+            Err(error) => error,
+        };
+
+        let int_error = match i64::from_str(s) {
+            Ok(signed) => return Ok(Number::Signed(signed)),
+            Err(error) => error,
+        };
+
+        let float_error = match f64::from_str(s) {
+            Ok(float) => return Ok(Number::Float(OrderedFloat(float))),
+            Err(error) => error,
+        };
+
+        Err(ParseNumberError { uint_error, int_error, float_error })
+    }
+}
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ParseNumberError {
+    uint_error: ParseIntError,
+    int_error: ParseIntError,
+    float_error: ParseFloatError,
+}
+
+impl fmt::Display for ParseNumberError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        if self.uint_error == self.int_error {
+            write!(f, "can not parse number: {}, {}", self.uint_error, self.float_error)
+        } else {
+            write!(f, "can not parse number: {}, {}, {}",
+                self.uint_error, self.int_error, self.float_error)
+        }
+    }
+}
diff --git a/meilidb-data/src/ranked_map.rs b/meilidb-data/src/ranked_map.rs
new file mode 100644
index 000000000..7b4ff3735
--- /dev/null
+++ b/meilidb-data/src/ranked_map.rs
@@ -0,0 +1,5 @@
+use hashbrown::HashMap;
+use meilidb_core::DocumentId;
+use crate::{SchemaAttr, Number};
+
+pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>;
diff --git a/src/database/schema.rs b/meilidb-data/src/schema.rs
similarity index 94%
rename from src/database/schema.rs
rename to meilidb-data/src/schema.rs
index fc64ffccc..bff7806dc 100644
--- a/src/database/schema.rs
+++ b/meilidb-data/src/schema.rs
@@ -5,13 +5,9 @@ use std::{fmt, u16};
 use std::ops::BitOr;
 use std::sync::Arc;
 
-use serde_derive::{Serialize, Deserialize};
+use serde::{Serialize, Deserialize};
 use linked_hash_map::LinkedHashMap;
 
-use crate::database::serde::find_id::FindDocumentIdSerializer;
-use crate::database::serde::SerializerError;
-use crate::DocumentId;
-
 pub const STORED: SchemaProps = SchemaProps { stored: true, indexed: false, ranked: false };
 pub const INDEXED: SchemaProps = SchemaProps { stored: false, indexed: true, ranked: false };
 pub const RANKED: SchemaProps = SchemaProps { stored: false, indexed: false, ranked: true };
@@ -166,14 +162,6 @@ impl Schema {
         attributes
     }
 
-    pub fn document_id<T>(&self, document: T) -> Result<DocumentId, SerializerError>
-    where T: serde::Serialize,
-    {
-        let id_attribute_name = &self.inner.identifier;
-        let serializer = FindDocumentIdSerializer { id_attribute_name };
-        document.serialize(serializer)
-    }
-
     pub fn props(&self, attr: SchemaAttr) -> SchemaProps {
         let (_, props) = self.inner.props[attr.0 as usize];
         props
diff --git a/src/database/serde/key_to_string.rs b/meilidb-data/src/serde/convert_to_number.rs
similarity index 51%
rename from src/database/serde/key_to_string.rs
rename to meilidb-data/src/serde/convert_to_number.rs
index 2fe0c5a39..cf30e3b62 100644
--- a/src/database/serde/key_to_string.rs
+++ b/meilidb-data/src/serde/convert_to_number.rs
@@ -1,12 +1,16 @@
-use serde::Serialize;
+use std::str::FromStr;
+
+use ordered_float::OrderedFloat;
 use serde::ser;
+use serde::Serialize;
 
-use crate::database::serde::SerializerError;
+use super::SerializerError;
+use crate::Number;
 
-pub struct KeyToStringSerializer;
+pub struct ConvertToNumber;
 
-impl ser::Serializer for KeyToStringSerializer {
-    type Ok = String;
+impl ser::Serializer for ConvertToNumber {
+    type Ok = Number;
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@@ -16,48 +20,78 @@ impl ser::Serializer for KeyToStringSerializer {
     type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
-    forward_to_unserializable_type! {
-        bool => serialize_bool,
-        char => serialize_char,
+    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
 
-        i8 => serialize_i8,
-        i16 => serialize_i16,
-        i32 => serialize_i32,
-        i64 => serialize_i64,
+    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnrankableType { type_name: "char" })
+    }
 
-        u8 => serialize_u8,
-        u16 => serialize_u16,
-        u32 => serialize_u32,
-        u64 => serialize_u64,
+    fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
 
-        f32 => serialize_f32,
-        f64 => serialize_f64,
+    fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
+
+    fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(i64::from(value)))
+    }
+
+    fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Signed(value))
+    }
+
+    fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(u64::from(value)))
+    }
+
+    fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Unsigned(value))
+    }
+
+    fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(OrderedFloat(value as f64)))
+    }
+
+    fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
+        Ok(Number::Float(OrderedFloat(value)))
     }
 
     fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
-        Ok(value.to_string())
+        Ok(Number::from_str(value)?)
     }
 
     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "&[u8]" })
+        Err(SerializerError::UnrankableType { type_name: "&[u8]" })
     }
 
     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnrankableType { type_name: "Option" })
     }
 
     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnrankableType { type_name: "Option" })
     }
 
     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "()" })
+        Err(SerializerError::UnrankableType { type_name: "()" })
    }
 
     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "unit struct" })
+        Err(SerializerError::UnrankableType { type_name: "unit struct" })
     }
 
     fn serialize_unit_variant(
         self,
         _name: &'static str,
@@ -67,7 +101,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _variant: &'static str
     ) -> Result<Self::Ok, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "unit variant" })
+        Err(SerializerError::UnrankableType { type_name: "unit variant" })
     }
 
     fn serialize_newtype_struct<T: ?Sized>(
         self,
         _name: &'static str,
@@ -89,15 +123,15 @@ impl ser::Serializer for KeyToStringSerializer {
     ) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "newtype variant" })
+        Err(SerializerError::UnrankableType { type_name: "newtype variant" })
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "sequence" })
+        Err(SerializerError::UnrankableType { type_name: "sequence" })
     }
 
     fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "tuple" })
+        Err(SerializerError::UnrankableType { type_name: "tuple" })
     }
 
     fn serialize_tuple_struct(
         self,
         _name: &'static str,
@@ -106,7 +140,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "tuple struct" })
+        Err(SerializerError::UnrankableType { type_name: "tuple struct" })
     }
 
     fn serialize_tuple_variant(
         self,
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
@@ -117,11 +151,11 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeTupleVariant, Self::Error>
    {
-        Err(SerializerError::UnserializableType { name: "tuple variant" })
+        Err(SerializerError::UnrankableType { type_name: "tuple variant" })
     }
 
     fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "map" })
+        Err(SerializerError::UnrankableType { type_name: "map" })
     }
 
     fn serialize_struct(
         self,
         _name: &'static str,
@@ -130,7 +164,7 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeStruct, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct" })
+        Err(SerializerError::UnrankableType { type_name: "struct" })
     }
 
     fn serialize_struct_variant(
         self,
         _name: &'static str,
         _variant_index: u32,
         _variant: &'static str,
@@ -141,6 +175,6 @@ impl ser::Serializer for KeyToStringSerializer {
         _len: usize
     ) -> Result<Self::SerializeStructVariant, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "struct variant" })
+        Err(SerializerError::UnrankableType { type_name: "struct variant" })
     }
 }
diff --git a/src/database/serde/value_to_number.rs b/meilidb-data/src/serde/convert_to_string.rs
similarity index 66%
rename from src/database/serde/value_to_number.rs
rename to meilidb-data/src/serde/convert_to_string.rs
index a70b92fc4..67e592e78 100644
--- a/src/database/serde/value_to_number.rs
+++ b/meilidb-data/src/serde/convert_to_string.rs
@@ -1,15 +1,12 @@
-use std::str::FromStr;
-
 use serde::Serialize;
-use serde::{ser, ser::Error};
+use serde::ser;
 
-use crate::database::serde::SerializerError;
-use crate::database::Number;
+use super::SerializerError;
 
-pub struct ValueToNumberSerializer;
+pub struct ConvertToString;
 
-impl ser::Serializer for ValueToNumberSerializer {
-    type Ok = Number;
+impl ser::Serializer for ConvertToString {
+    type Ok = String;
     type Error = SerializerError;
     type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
@@ -19,75 +16,78 @@ impl ser::Serializer for ValueToNumberSerializer {
     type SerializeStruct = ser::Impossible<Self::Ok, Self::Error>;
     type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
 
-    forward_to_unserializable_type! {
-        bool => serialize_bool,
-        char => serialize_char,
+    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "boolean" })
+    }
+
+    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+        Ok(value.to_string())
     }
 
     fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Signed(value as i64))
+        Ok(value.to_string())
     }
 
     fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
    }
 
     fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Unsigned(value as u64))
+        Ok(value.to_string())
     }
 
     fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Float(value as f64))
+        Ok(value.to_string())
     }
 
     fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
-        Ok(Number::Float(value))
+        Ok(value.to_string())
     }
 
     fn serialize_str(self, value: &str) -> Result<Self::Ok, Self::Error> {
-        Number::from_str(value).map_err(SerializerError::custom)
+        Ok(value.to_string())
     }
 
     fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "&[u8]" })
+        Err(SerializerError::UnserializableType { type_name: "&[u8]" })
     }
 
     fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnserializableType { type_name: "Option" })
     }
 
     fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "Option" })
+        Err(SerializerError::UnserializableType { type_name: "Option" })
     }
 
     fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "()" })
+        Err(SerializerError::UnserializableType { type_name: "()" })
     }
 
     fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
-        Err(SerializerError::UnserializableType { name: "unit struct" })
+        Err(SerializerError::UnserializableType { type_name: "unit struct" })
     }
 
     fn serialize_unit_variant(
         self,
         _name: &'static str,
@@ -97,7 +97,7 @@ impl ser::Serializer for ValueToNumberSerializer {
         _variant: &'static str
     ) -> Result<Self::Ok, Self::Error>
     {
-        Err(SerializerError::UnserializableType { name: "unit variant" })
+        Err(SerializerError::UnserializableType { type_name: "unit variant" })
     }
 
     fn serialize_newtype_struct<T: ?Sized>(
         self,
         _name: &'static str,
@@ -119,15 +119,15 @@ impl ser::Serializer for ValueToNumberSerializer {
     ) -> Result<Self::Ok, Self::Error>
     where T: Serialize,
     {
-        Err(SerializerError::UnserializableType { name: "newtype variant" })
+        Err(SerializerError::UnserializableType { type_name: "newtype variant" })
     }
 
     fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
-        Err(SerializerError::UnserializableType { name:
"sequence" }) + Err(SerializerError::UnserializableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) + Err(SerializerError::UnserializableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -136,7 +136,7 @@ impl ser::Serializer for ValueToNumberSerializer { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple struct" }) + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -147,11 +147,11 @@ impl ser::Serializer for ValueToNumberSerializer { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple variant" }) + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) + Err(SerializerError::UnserializableType { type_name: "map" }) } fn serialize_struct( @@ -160,7 +160,7 @@ impl ser::Serializer for ValueToNumberSerializer { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct" }) + Err(SerializerError::UnserializableType { type_name: "struct" }) } fn serialize_struct_variant( @@ -171,6 +171,6 @@ impl ser::Serializer for ValueToNumberSerializer { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "struct variant" }) + Err(SerializerError::UnserializableType { type_name: "struct variant" }) } } diff --git a/meilidb-data/src/serde/deserializer.rs b/meilidb-data/src/serde/deserializer.rs new file mode 100644 index 000000000..12873713b --- /dev/null +++ b/meilidb-data/src/serde/deserializer.rs @@ -0,0 +1,97 @@ +use std::collections::HashSet; +use std::io::Cursor; + +use meilidb_core::DocumentId; +use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader}; +use rmp_serde::decode::{Error as RmpError}; +use serde::{de, forward_to_deserialize_any}; + +use crate::database::RawIndex; +use crate::SchemaAttr; + +pub struct Deserializer<'a> { + pub document_id: DocumentId, + pub raw_index: &'a RawIndex, + pub fields: Option<&'a HashSet>, +} + +impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a> +{ + type Error = RmpError; + + fn deserialize_any(self, visitor: V) -> Result + where V: de::Visitor<'de> + { + self.deserialize_map(visitor) + } + + forward_to_deserialize_any! 
+use std::collections::HashSet;
+use std::io::Cursor;
+
+use meilidb_core::DocumentId;
+use rmp_serde::decode::{Deserializer as RmpDeserializer, ReadReader};
+use rmp_serde::decode::{Error as RmpError};
+use serde::{de, forward_to_deserialize_any};
+
+use crate::database::RawIndex;
+use crate::SchemaAttr;
+
+pub struct Deserializer<'a> {
+    pub document_id: DocumentId,
+    pub raw_index: &'a RawIndex,
+    pub fields: Option<&'a HashSet<SchemaAttr>>,
+}
+
+impl<'de, 'a, 'b> de::Deserializer<'de> for &'b mut Deserializer<'a>
+{
+    type Error = RmpError;
+
+    fn deserialize_any<V>(self, visitor: V) -> Result<V::Value, Self::Error>
+    where V: de::Visitor<'de>
+    {
+        self.deserialize_map(visitor)
+    }
+
+    forward_to_deserialize_any! {
+        bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq
+        bytes byte_buf unit_struct tuple_struct
+        identifier tuple ignored_any option newtype_struct enum struct
+    }
+
+    fn deserialize_map<V>(self, visitor: V) -> Result<V::Value, Self::Error>
+    where V: de::Visitor<'de>
+    {
+        let document_attributes = self.raw_index.get_document_fields(self.document_id);
+        let document_attributes = document_attributes.filter_map(|result| {
+            match result {
+                Ok(value) => Some(value),
+                Err(e) => {
+                    // TODO: must log the error
+                    // error!("sled iter error; {}", e);
+                    None
+                },
+            }
+        });
+        let iter = document_attributes.filter_map(|(_, attr, value)| {
+            if self.fields.map_or(true, |f| f.contains(&attr)) {
+                let attribute_name = self.raw_index.schema().attribute_name(attr);
+                Some((attribute_name, Value::new(value)))
+            } else {
+                None
+            }
+        });
+
+        let map_deserializer = de::value::MapDeserializer::new(iter);
+        visitor.visit_map(map_deserializer)
+    }
+}
+
+struct Value<A>(RmpDeserializer<ReadReader<Cursor<A>>>) where A: AsRef<[u8]>;
+
+impl<A> Value<A> where A: AsRef<[u8]>
+{
+    fn new(value: A) -> Value<A> {
+        Value(RmpDeserializer::new(Cursor::new(value)))
+    }
+}
+
+impl<'de, A> de::IntoDeserializer<'de, RmpError> for Value<A>
+where A: AsRef<[u8]>,
+{
+    type Deserializer = Self;
+
+    fn into_deserializer(self) -> Self::Deserializer {
+        self
+    }
+}
+
+impl<'de, 'a, A> de::Deserializer<'de> for Value<A>
+where A: AsRef<[u8]>,
+{
+    type Error = RmpError;
+
+    fn deserialize_any<V>(mut self, visitor: V) -> Result<V::Value, Self::Error>
+    where V: de::Visitor<'de>
+    {
+        self.0.deserialize_any(visitor)
+    }
+
+    forward_to_deserialize_any! {
+        bool i8 i16 i32 i64 i128 u8 u16 u32 u64 u128 f32 f64 char str string
+        bytes byte_buf option unit unit_struct newtype_struct seq tuple
+        tuple_struct map struct enum identifier ignored_any
+    }
+}
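Hypothetical round trip through `Index::document` as defined earlier in this patch (the `Document` type is an assumption for illustration; passing `None` for the fields set retrieves every stored attribute):

```rust
#[derive(serde::Deserialize, Debug)]
struct Document { id: u64, title: String }

fn fetch(index: &meilidb_data::Index, id: meilidb_core::DocumentId)
    -> Result<Option<Document>, rmp_serde::decode::Error>
{
    // the Deserializer above walks the document's sled entries and feeds
    // them to serde as one map of attribute name -> msgpack value
    index.document(None, id)
}
```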
ser::Impossible; forward_to_unserializable_type! { @@ -38,30 +56,30 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { f64 => serialize_f64, } - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) + fn serialize_str(self, value: &str) -> Result { + Err(SerializerError::UnserializableType { type_name: "str" }) } fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) + Err(SerializerError::UnserializableType { type_name: "&[u8]" }) } fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_some(self, _value: &T) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "Option" }) + Err(SerializerError::UnserializableType { type_name: "Option" }) } fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) + Err(SerializerError::UnserializableType { type_name: "()" }) } fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) + Err(SerializerError::UnserializableType { type_name: "unit struct" }) } fn serialize_unit_variant( @@ -71,7 +89,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { _variant: &'static str ) -> Result { - Err(SerializerError::UnserializableType { name: "unit variant" }) + Err(SerializerError::UnserializableType { type_name: "unit variant" }) } fn serialize_newtype_struct( @@ -93,15 +111,15 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { ) -> Result where T: Serialize, { - Err(SerializerError::UnserializableType { name: "newtype variant" }) + Err(SerializerError::UnserializableType { type_name: "newtype variant" }) } fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) + Err(SerializerError::UnserializableType { type_name: "sequence" }) } fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) + Err(SerializerError::UnserializableType { type_name: "tuple" }) } fn serialize_tuple_struct( @@ -110,7 +128,7 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple struct" }) + Err(SerializerError::UnserializableType { type_name: "tuple struct" }) } fn serialize_tuple_variant( @@ -121,15 +139,17 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { _len: usize ) -> Result { - Err(SerializerError::UnserializableType { name: "tuple variant" }) + Err(SerializerError::UnserializableType { type_name: "tuple variant" }) } fn serialize_map(self, _len: Option) -> Result { - Ok(FindDocumentIdMapSerializer { - id_attribute_name: self.id_attribute_name, + let serializer = ExtractDocumentIdMapSerializer { + identifier: self.identifier, document_id: None, current_key_name: None, - }) + }; + + Ok(serializer) } fn serialize_struct( @@ -138,10 +158,12 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { _len: usize ) -> Result { - Ok(FindDocumentIdStructSerializer { - id_attribute_name: self.id_attribute_name, + let serializer = ExtractDocumentIdStructSerializer { + identifier: self.identifier, document_id: None, - }) + }; + + Ok(serializer) } fn serialize_struct_variant( @@ -152,24 +174,24 @@ impl<'a> ser::Serializer for FindDocumentIdSerializer<'a> { _len: usize ) -> Result { - 
Err(SerializerError::UnserializableType { name: "struct variant" }) + Err(SerializerError::UnserializableType { type_name: "struct variant" }) } } -pub struct FindDocumentIdMapSerializer<'a> { - id_attribute_name: &'a str, +pub struct ExtractDocumentIdMapSerializer<'a> { + identifier: &'a str, document_id: Option, current_key_name: Option, } -impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { - type Ok = DocumentId; +impl<'a> ser::SerializeMap for ExtractDocumentIdMapSerializer<'a> { + type Ok = Option; type Error = SerializerError; fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> where T: Serialize, { - let key = key.serialize(KeyToStringSerializer)?; + let key = key.serialize(ConvertToString)?; self.current_key_name = Some(key); Ok(()) } @@ -188,9 +210,9 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { ) -> Result<(), Self::Error> where K: Serialize, V: Serialize, { - let key = key.serialize(KeyToStringSerializer)?; + let key = key.serialize(ConvertToString)?; - if self.id_attribute_name == key { + if self.identifier == key { // TODO is it possible to have multiple ids? let id = bincode::serialize(value).unwrap(); let hash = calculate_hash(&id); @@ -201,20 +223,17 @@ impl<'a> ser::SerializeMap for FindDocumentIdMapSerializer<'a> { } fn end(self) -> Result { - match self.document_id { - Some(document_id) => Ok(document_id), - None => Err(SerializerError::DocumentIdNotFound) - } + Ok(self.document_id) } } -pub struct FindDocumentIdStructSerializer<'a> { - id_attribute_name: &'a str, +pub struct ExtractDocumentIdStructSerializer<'a> { + identifier: &'a str, document_id: Option, } -impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { - type Ok = DocumentId; +impl<'a> ser::SerializeStruct for ExtractDocumentIdStructSerializer<'a> { + type Ok = Option; type Error = SerializerError; fn serialize_field( @@ -224,7 +243,7 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> { ) -> Result<(), Self::Error> where T: Serialize, { - if self.id_attribute_name == key { + if self.identifier == key { // TODO can it be possible to have multiple ids? 
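            // note: the extracted document id is the DefaultHasher hash of the
            // bincode-encoded identifier value (see calculate_hash above)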
            let id = bincode::serialize(value).unwrap();
            let hash = calculate_hash(&id);
@@ -235,9 +254,6 @@ impl<'a> ser::SerializeStruct for FindDocumentIdStructSerializer<'a> {
     }
 
     fn end(self) -> Result<Self::Ok, Self::Error> {
-        match self.document_id {
-            Some(document_id) => Ok(document_id),
-            None => Err(SerializerError::DocumentIdNotFound)
-        }
+        Ok(self.document_id)
     }
 }
diff --git a/meilidb-data/src/serde/indexer.rs b/meilidb-data/src/serde/indexer.rs
new file mode 100644
index 000000000..8eb0b2c67
--- /dev/null
+++ b/meilidb-data/src/serde/indexer.rs
@@ -0,0 +1,337 @@
+use meilidb_core::DocumentId;
+use serde::ser;
+use serde::Serialize;
+
+use crate::database::RawIndex;
+use crate::indexer::Indexer as RawIndexer;
+use crate::schema::SchemaAttr;
+use super::{SerializerError, ConvertToString};
+
+pub struct Indexer<'a> {
+    pub attribute: SchemaAttr,
+    pub indexer: &'a mut RawIndexer,
+    pub document_id: DocumentId,
+}
+
+impl<'a> ser::Serializer for Indexer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+    type SerializeSeq = SeqIndexer<'a>;
+    type SerializeTuple = TupleIndexer<'a>;
+    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeMap = MapIndexer<'a>;
+    type SerializeStruct = StructSerializer<'a>;
+    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
+
+    fn serialize_bool(self, value: bool) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnindexableType { type_name: "boolean" })
+    }
+
+    fn serialize_char(self, value: char) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_i8(self, value: i8) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_i16(self, value: i16) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_i32(self, value: i32) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_i64(self, value: i64) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_u8(self, value: u8) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_u16(self, value: u16) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_u32(self, value: u32) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_u64(self, value: u64) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_f32(self, value: f32) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_f64(self, value: f64) -> Result<Self::Ok, Self::Error> {
+        let text = value.serialize(ConvertToString)?;
+        self.serialize_str(&text)
+    }
+
+    fn serialize_str(self, text: &str) -> Result<Self::Ok, Self::Error> {
+        self.indexer.index_text(self.document_id, self.attribute, text);
+        Ok(())
+    }
+
+    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnindexableType { type_name: "&[u8]" })
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnindexableType { type_name: "Option" })
+    }
+
+    fn serialize_some<T: ?Sized>(self, value: &T) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        let text = value.serialize(ConvertToString)?;
+        self.indexer.index_text(self.document_id, self.attribute, &text);
+        Ok(())
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnindexableType { type_name: "()" })
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnindexableType { type_name: "unit struct" })
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str
+    ) -> Result<Self::Ok, Self::Error>
+    {
+        Err(SerializerError::UnindexableType { type_name: "unit variant" })
+    }
+
+    fn serialize_newtype_struct<T: ?Sized>(
+        self,
+        _name: &'static str,
+        value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_newtype_variant<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        Err(SerializerError::UnindexableType { type_name: "newtype variant" })
+    }
+
+    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
+        let indexer = SeqIndexer {
+            attribute: self.attribute,
+            document_id: self.document_id,
+            indexer: self.indexer,
+            texts: Vec::new(),
+        };
+
+        Ok(indexer)
+    }
+
+    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
+        let indexer = TupleIndexer {
+            attribute: self.attribute,
+            document_id: self.document_id,
+            indexer: self.indexer,
+            texts: Vec::new(),
+        };
+
+        Ok(indexer)
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleStruct, Self::Error>
+    {
+        Err(SerializerError::UnindexableType { type_name: "tuple struct" })
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleVariant, Self::Error>
+    {
+        Err(SerializerError::UnindexableType { type_name: "tuple variant" })
+    }
+
+    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
+        let indexer = MapIndexer {
+            attribute: self.attribute,
+            document_id: self.document_id,
+            indexer: self.indexer,
+            texts: Vec::new(),
+        };
+
+        Ok(indexer)
+    }
+
+    fn serialize_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStruct, Self::Error>
+    {
+        Err(SerializerError::UnindexableType { type_name: "struct" })
+    }
+
+    fn serialize_struct_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStructVariant, Self::Error>
+    {
+        Err(SerializerError::UnindexableType { type_name: "struct variant" })
+    }
+}
+
+pub struct SeqIndexer<'a> {
+    attribute: SchemaAttr,
+    document_id: DocumentId,
+    indexer: &'a mut RawIndexer,
+    texts: Vec<String>,
+}
+
+impl<'a> ser::SerializeSeq for SeqIndexer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+        where T: ser::Serialize
+    {
+        let text = value.serialize(ConvertToString)?;
+        self.texts.push(text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
+
+pub struct MapIndexer<'a> {
+    attribute: SchemaAttr,
+    document_id: DocumentId,
+    indexer: &'a mut RawIndexer,
+    texts: Vec<String>,
+}
+
+impl<'a> ser::SerializeMap for MapIndexer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        let text = key.serialize(ConvertToString)?;
+        self.texts.push(text);
+        Ok(())
+    }
+
+    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        let text = value.serialize(ConvertToString)?;
+        self.texts.push(text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
+
+pub struct StructSerializer<'a> {
+    attribute: SchemaAttr,
+    document_id: DocumentId,
+    indexer: &'a mut RawIndexer,
+    texts: Vec<String>,
+}
+
+impl<'a> ser::SerializeStruct for StructSerializer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_field<T: ?Sized>(
+        &mut self,
+        key: &'static str,
+        value: &T,
+    ) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        let key_text = key.to_owned();
+        let value_text = value.serialize(ConvertToString)?;
+        self.texts.push(key_text);
+        self.texts.push(value_text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
+
+pub struct TupleIndexer<'a> {
+    attribute: SchemaAttr,
+    document_id: DocumentId,
+    indexer: &'a mut RawIndexer,
+    texts: Vec<String>,
+}
+
+impl<'a> ser::SerializeTuple for TupleIndexer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_element<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+        where T: Serialize
+    {
+        let text = value.serialize(ConvertToString)?;
+        self.texts.push(text);
+        Ok(())
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        let texts = self.texts.iter().map(String::as_str);
+        self.indexer.index_text_seq(self.document_id, self.attribute, texts);
+        Ok(())
+    }
+}
diff --git a/meilidb-data/src/serde/mod.rs b/meilidb-data/src/serde/mod.rs
new file mode 100644
index 000000000..cf222c1bd
--- /dev/null
+++ b/meilidb-data/src/serde/mod.rs
@@ -0,0 +1,97 @@
+macro_rules! forward_to_unserializable_type {
+    ($($ty:ident => $se_method:ident,)*) => {
+        $(
+            fn $se_method(self, _v: $ty) -> Result<Self::Ok, Self::Error> {
+                Err(SerializerError::UnserializableType { type_name: "$ty" })
+            }
+        )*
+    }
+}
+
+mod convert_to_number;
+mod convert_to_string;
+mod deserializer;
+mod extract_document_id;
+mod indexer;
+mod serializer;
+
+pub use self::deserializer::Deserializer;
+pub use self::extract_document_id::extract_document_id;
+pub use self::convert_to_string::ConvertToString;
+pub use self::convert_to_number::ConvertToNumber;
+pub use self::indexer::Indexer;
+pub use self::serializer::Serializer;
+
+use std::{fmt, error::Error};
+use rmp_serde::encode::Error as RmpError;
+use serde::ser;
+use crate::number::ParseNumberError;
+
+#[derive(Debug)]
+pub enum SerializerError {
+    DocumentIdNotFound,
+    RmpError(RmpError),
+    SledError(sled::Error),
+    ParseNumberError(ParseNumberError),
+    UnserializableType { type_name: &'static str },
+    UnindexableType { type_name: &'static str },
+    UnrankableType { type_name: &'static str },
+    Custom(String),
+}
+
+impl ser::Error for SerializerError {
+    fn custom<T: fmt::Display>(msg: T) -> Self {
+        SerializerError::Custom(msg.to_string())
+    }
+}
+
+impl fmt::Display for SerializerError {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        match self {
+            SerializerError::DocumentIdNotFound => {
+                write!(f, "serialized document does not have an id according to the schema")
+            }
+            SerializerError::RmpError(e) => write!(f, "rmp serde related error: {}", e),
+            SerializerError::SledError(e) => write!(f, "sled related error: {}", e),
+            SerializerError::ParseNumberError(e) => {
+                write!(f, "error while trying to parse a number: {}", e)
+            },
+            SerializerError::UnserializableType { type_name } => {
+                write!(f, "{} are not a serializable type", type_name)
+            },
+            SerializerError::UnindexableType { type_name } => {
+                write!(f, "{} are not an indexable type", type_name)
+            },
+            SerializerError::UnrankableType { type_name } => {
+                write!(f, "{} types can not be used for ranking", type_name)
+            },
+            SerializerError::Custom(s) => f.write_str(s),
+        }
+    }
+}
+
+impl Error for SerializerError {}
+
+impl From<String> for SerializerError {
+    fn from(value: String) -> SerializerError {
+        SerializerError::Custom(value)
+    }
+}
+
+impl From<RmpError> for SerializerError {
+    fn from(error: RmpError) -> SerializerError {
+        SerializerError::RmpError(error)
+    }
+}
+
+impl From<sled::Error> for SerializerError {
+    fn from(error: sled::Error) -> SerializerError {
+        SerializerError::SledError(error)
+    }
+}
+
+impl From<ParseNumberError> for SerializerError {
+    fn from(error: ParseNumberError) -> SerializerError {
+        SerializerError::ParseNumberError(error)
+    }
+}
diff --git a/meilidb-data/src/serde/serializer.rs b/meilidb-data/src/serde/serializer.rs
new file mode 100644
index 000000000..37b3c7036
--- /dev/null
+++ b/meilidb-data/src/serde/serializer.rs
@@ -0,0 +1,295 @@
+use meilidb_core::DocumentId;
+use serde::ser;
+
+use crate::database::RawIndex;
+use crate::ranked_map::RankedMap;
+use crate::indexer::Indexer as RawIndexer;
+use crate::schema::{Schema, SchemaAttr};
+use super::{SerializerError, ConvertToString, ConvertToNumber, Indexer};
+
+pub struct Serializer<'a> {
+    pub schema: &'a Schema,
+    pub index: &'a RawIndex,
+    pub indexer: &'a mut RawIndexer,
+    pub ranked_map: &'a mut RankedMap,
+    pub document_id: DocumentId,
+}
+
+impl<'a> ser::Serializer for Serializer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+    type SerializeSeq = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTuple = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTupleStruct = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeTupleVariant = ser::Impossible<Self::Ok, Self::Error>;
+    type SerializeMap = MapSerializer<'a>;
+    type SerializeStruct = StructSerializer<'a>;
+    type SerializeStructVariant = ser::Impossible<Self::Ok, Self::Error>;
+
+    forward_to_unserializable_type! {
+        bool => serialize_bool,
+        char => serialize_char,
+
+        i8  => serialize_i8,
+        i16 => serialize_i16,
+        i32 => serialize_i32,
+        i64 => serialize_i64,
+
+        u8  => serialize_u8,
+        u16 => serialize_u16,
+        u32 => serialize_u32,
+        u64 => serialize_u64,
+
+        f32 => serialize_f32,
+        f64 => serialize_f64,
+    }
+
+    fn serialize_str(self, _v: &str) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "str" })
+    }
+
+    fn serialize_bytes(self, _v: &[u8]) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "&[u8]" })
+    }
+
+    fn serialize_none(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "Option" })
+    }
+
+    fn serialize_some<T: ?Sized>(self, _value: &T) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        Err(SerializerError::UnserializableType { type_name: "Option" })
+    }
+
+    fn serialize_unit(self) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "()" })
+    }
+
+    fn serialize_unit_struct(self, _name: &'static str) -> Result<Self::Ok, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "unit struct" })
+    }
+
+    fn serialize_unit_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str
+    ) -> Result<Self::Ok, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { type_name: "unit variant" })
+    }
+
+    fn serialize_newtype_struct<T: ?Sized>(
+        self,
+        _name: &'static str,
+        value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        value.serialize(self)
+    }
+
+    fn serialize_newtype_variant<T: ?Sized>(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _value: &T
+    ) -> Result<Self::Ok, Self::Error>
+        where T: ser::Serialize,
+    {
+        Err(SerializerError::UnserializableType { type_name: "newtype variant" })
+    }
+
+    fn serialize_seq(self, _len: Option<usize>) -> Result<Self::SerializeSeq, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "sequence" })
+    }
+
+    fn serialize_tuple(self, _len: usize) -> Result<Self::SerializeTuple, Self::Error> {
+        Err(SerializerError::UnserializableType { type_name: "tuple" })
+    }
+
+    fn serialize_tuple_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleStruct, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { type_name: "tuple struct" })
+    }
+
+    fn serialize_tuple_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeTupleVariant, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { type_name: "tuple variant" })
+    }
+
+    fn serialize_map(self, _len: Option<usize>) -> Result<Self::SerializeMap, Self::Error> {
+        Ok(MapSerializer {
+            schema: self.schema,
+            document_id: self.document_id,
+            index: self.index,
+            indexer: self.indexer,
+            ranked_map: self.ranked_map,
+            current_key_name: None,
+        })
+    }
+
+    fn serialize_struct(
+        self,
+        _name: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStruct, Self::Error>
+    {
+        Ok(StructSerializer {
+            schema: self.schema,
+            document_id: self.document_id,
+            index: self.index,
+            indexer: self.indexer,
+            ranked_map: self.ranked_map,
+        })
+    }
+
+    fn serialize_struct_variant(
+        self,
+        _name: &'static str,
+        _variant_index: u32,
+        _variant: &'static str,
+        _len: usize
+    ) -> Result<Self::SerializeStructVariant, Self::Error>
+    {
+        Err(SerializerError::UnserializableType { type_name: "struct variant" })
+    }
+}
+
+pub struct MapSerializer<'a> {
+    schema: &'a Schema,
+    document_id: DocumentId,
+    index: &'a RawIndex,
+    indexer: &'a mut RawIndexer,
+    ranked_map: &'a mut RankedMap,
+    current_key_name: Option<String>,
+}
+
+impl<'a> ser::SerializeMap for MapSerializer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_key<T: ?Sized>(&mut self, key: &T) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        let key = key.serialize(ConvertToString)?;
+        self.current_key_name = Some(key);
+        Ok(())
+    }
+
+    fn serialize_value<T: ?Sized>(&mut self, value: &T) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        let key = self.current_key_name.take().unwrap();
+        self.serialize_entry(&key, value)
+    }
+
+    fn serialize_entry<K: ?Sized, V: ?Sized>(
+        &mut self,
+        key: &K,
+        value: &V,
+    ) -> Result<(), Self::Error>
+        where K: ser::Serialize, V: ser::Serialize,
+    {
+        let key = key.serialize(ConvertToString)?;
+
+        serialize_value(
+            self.schema,
+            self.document_id,
+            self.index,
+            self.indexer,
+            self.ranked_map,
+            &key,
+            value,
+        )
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(())
+    }
+}
+
+pub struct StructSerializer<'a> {
+    schema: &'a Schema,
+    document_id: DocumentId,
+    index: &'a RawIndex,
+    indexer: &'a mut RawIndexer,
+    ranked_map: &'a mut RankedMap,
+}
+
+impl<'a> ser::SerializeStruct for StructSerializer<'a> {
+    type Ok = ();
+    type Error = SerializerError;
+
+    fn serialize_field<T: ?Sized>(
+        &mut self,
+        key: &'static str,
+        value: &T,
+    ) -> Result<(), Self::Error>
+        where T: ser::Serialize,
+    {
+        serialize_value(
+            self.schema,
+            self.document_id,
+            self.index,
+            self.indexer,
+            self.ranked_map,
+            key,
+            value,
+        )
+    }
+
+    fn end(self) -> Result<Self::Ok, Self::Error> {
+        Ok(())
+    }
+}
+
+fn serialize_value<T: ?Sized>(
+    schema: &Schema,
+    document_id: DocumentId,
+    index: &RawIndex,
+    indexer: &mut RawIndexer,
+    ranked_map: &mut RankedMap,
+    key: &str,
+    value: &T,
+) -> Result<(), SerializerError>
+where T: ser::Serialize,
+{
+    if let Some(attr) = schema.attribute(key) {
+        let props = schema.props(attr);
+
+        if props.is_stored() {
+            let value = rmp_serde::to_vec_named(value)?;
+            index.set_document_attribute(document_id, attr, value)?;
+        }
+
+        if props.is_indexed() {
+            let indexer = Indexer {
+                attribute: attr,
+                indexer: indexer,
+                document_id: document_id,
+            };
+            value.serialize(indexer)?;
+        }
+
+        if props.is_ranked() {
+            let key = (document_id, attr);
+            let number = value.serialize(ConvertToNumber)?;
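+            // the parsed Number lands in the in-memory RankedMap, which
+            // ranking criteria (such as meilidb's SortByAttr) read back at query time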
+            ranked_map.insert(key, number);
+        }
+    }
+
+    Ok(())
+}
diff --git a/meilidb-tokenizer/Cargo.toml b/meilidb-tokenizer/Cargo.toml
new file mode 100644
index 000000000..32c9429b7
--- /dev/null
+++ b/meilidb-tokenizer/Cargo.toml
@@ -0,0 +1,8 @@
+[package]
+name = "meilidb-tokenizer"
+version = "0.1.0"
+authors = ["Kerollmops "]
+edition = "2018"
+
+[dependencies]
+slice-group-by = "0.2.4"
diff --git a/meilidb-tokenizer/src/lib.rs b/meilidb-tokenizer/src/lib.rs
new file mode 100644
index 000000000..88e389a46
--- /dev/null
+++ b/meilidb-tokenizer/src/lib.rs
@@ -0,0 +1,295 @@
+use std::iter::Peekable;
+use slice_group_by::StrGroupBy;
+use self::SeparatorCategory::*;
+
+pub fn is_cjk(c: char) -> bool {
+    (c >= '\u{2e80}' && c <= '\u{2eff}') ||
+    (c >= '\u{2f00}' && c <= '\u{2fdf}') ||
+    (c >= '\u{3040}' && c <= '\u{309f}') ||
+    (c >= '\u{30a0}' && c <= '\u{30ff}') ||
+    (c >= '\u{3100}' && c <= '\u{312f}') ||
+    (c >= '\u{3200}' && c <= '\u{32ff}') ||
+    (c >= '\u{3400}' && c <= '\u{4dbf}') ||
+    (c >= '\u{4e00}' && c <= '\u{9fff}') ||
+    (c >= '\u{f900}' && c <= '\u{faff}')
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum SeparatorCategory {
+    Soft,
+    Hard,
+}
+
+impl SeparatorCategory {
+    fn merge(self, other: SeparatorCategory) -> SeparatorCategory {
+        if let (Soft, Soft) = (self, other) { Soft } else { Hard }
+    }
+
+    fn to_usize(self) -> usize {
+        match self {
+            Soft => 1,
+            Hard => 8,
+        }
+    }
+}
+
+fn is_separator(c: char) -> bool {
+    classify_separator(c).is_some()
+}
+
+fn classify_separator(c: char) -> Option<SeparatorCategory> {
+    match c {
+        ' ' | '\'' | '"' => Some(Soft),
+        '.' | ';' | ',' | '!' | '?' | '-' | '(' | ')' => Some(Hard),
+        _ => None,
+    }
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+enum CharCategory {
+    Separator(SeparatorCategory),
+    Cjk,
+    Other,
+}
+
+fn classify_char(c: char) -> CharCategory {
+    if let Some(category) = classify_separator(c) {
+        CharCategory::Separator(category)
+    } else if is_cjk(c) {
+        CharCategory::Cjk
+    } else {
+        CharCategory::Other
+    }
+}
+
+fn is_str_word(s: &str) -> bool {
+    !s.chars().any(is_separator)
+}
+
+fn same_group_category(a: char, b: char) -> bool {
+    match (classify_char(a), classify_char(b)) {
+        (CharCategory::Cjk, _) | (_, CharCategory::Cjk) => false,
+        (CharCategory::Separator(_), CharCategory::Separator(_)) => true,
+        (a, b) => a == b,
+    }
+}
+
+// fold the number of chars along with the index position
+fn chars_count_index((n, _): (usize, usize), (i, c): (usize, char)) -> (usize, usize) {
+    (n + 1, i + c.len_utf8())
+}
+
+pub fn split_query_string(query: &str) -> impl Iterator<Item = &str> {
+    Tokenizer::new(query).map(|t| t.word)
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub struct Token<'a> {
+    pub word: &'a str,
+    pub word_index: usize,
+    pub char_index: usize,
+}
+
+pub struct Tokenizer<'a> {
+    inner: &'a str,
+    word_index: usize,
+    char_index: usize,
+}
+
+impl<'a> Tokenizer<'a> {
+    pub fn new(string: &str) -> Tokenizer {
+        // skip every separator and set `char_index`
+        // to the number of char trimmed
+        let (count, index) = string.char_indices()
+            .take_while(|(_, c)| is_separator(*c))
+            .fold((0, 0), chars_count_index);
+
+        Tokenizer {
+            inner: &string[index..],
+            word_index: 0,
+            char_index: count,
+        }
+    }
+}
+
+impl<'a> Iterator for Tokenizer<'a> {
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        let mut iter = self.inner.linear_group_by(same_group_category).peekable();
+
+        while let (Some(string), next_string) = (iter.next(), iter.peek()) {
+            let (count, index) = string.char_indices().fold((0, 0), chars_count_index);
+
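+            // a group made only of separators is skipped: its categories are
+            // merged (soft + soft stays soft, anything else is hard) and the
+            // word index advances by 1 or 8 accordingly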
+            if !is_str_word(string) {
+                self.word_index += string.chars()
+                    .filter_map(classify_separator)
+                    .fold(Soft, |a, x| a.merge(x))
+                    .to_usize();
+                self.char_index += count;
+                self.inner = &self.inner[index..];
+                continue;
+            }
+
+            let token = Token {
+                word: string,
+                word_index: self.word_index,
+                char_index: self.char_index,
+            };
+
+            if next_string.filter(|s| is_str_word(s)).is_some() {
+                self.word_index += 1;
+            }
+
+            self.char_index += count;
+            self.inner = &self.inner[index..];
+
+            return Some(token);
+        }
+
+        self.inner = "";
+        None
+    }
+}
+
+pub struct SeqTokenizer<'a, I>
+where I: Iterator<Item = &'a str>,
+{
+    inner: I,
+    current: Option<Peekable<Tokenizer<'a>>>,
+    word_offset: usize,
+    char_offset: usize,
+}
+
+impl<'a, I> SeqTokenizer<'a, I>
+where I: Iterator<Item = &'a str>,
+{
+    pub fn new(mut iter: I) -> SeqTokenizer<'a, I> {
+        let current = iter.next().map(|s| Tokenizer::new(s).peekable());
+        SeqTokenizer {
+            inner: iter,
+            current: current,
+            word_offset: 0,
+            char_offset: 0,
+        }
+    }
+}
+
+impl<'a, I> Iterator for SeqTokenizer<'a, I>
+where I: Iterator<Item = &'a str>,
+{
+    type Item = Token<'a>;
+
+    fn next(&mut self) -> Option<Self::Item> {
+        match &mut self.current {
+            Some(current) => {
+                match current.next() {
+                    Some(token) => {
+                        // we must apply the word and char offsets
+                        // to the token before returning it
+                        let token = Token {
+                            word: token.word,
+                            word_index: token.word_index + self.word_offset,
+                            char_index: token.char_index + self.char_offset,
+                        };
+
+                        // if this is the last iteration on this text
+                        // we must save the offsets for next texts
+                        if current.peek().is_none() {
+                            let hard_space = SeparatorCategory::Hard.to_usize();
+                            self.word_offset = token.word_index + hard_space;
+                            self.char_offset = token.char_index + hard_space;
+                        }
+
+                        Some(token)
+                    },
+                    None => {
+                        // no more words in this text we must
+                        // start tokenizing the next text
+                        self.current = self.inner.next().map(|s| Tokenizer::new(s).peekable());
+                        self.next()
+                    },
+                }
+            },
+            // no more texts available
+            None => None,
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn easy() {
+        let mut tokenizer = Tokenizer::new("salut");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo    ");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard() {
+        let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . aïe ,");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard_long_chars() {
+        let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+
+    #[test]
+    fn hard_kanjis() {
+        let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 }));
+        assert_eq!(tokenizer.next(), None);
+
+        let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}");
+
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 }));
+        assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 }));
+        assert_eq!(tokenizer.next(), None);
+    }
+}
diff --git a/meilidb/Cargo.toml b/meilidb/Cargo.toml
new file mode 100644
index 000000000..c2f4ad0fc
--- /dev/null
+++ b/meilidb/Cargo.toml
@@ -0,0 +1,27 @@
+[package]
+edition = "2018"
+name = "meilidb"
+version = "0.3.1"
+authors = ["Kerollmops "]
+
+[dependencies]
+meilidb-core = { path = "../meilidb-core", version = "0.1.0" }
+meilidb-data = { path = "../meilidb-data", version = "0.1.0" }
+meilidb-tokenizer = { path = "../meilidb-tokenizer", version = "0.1.0" }
+
+[features]
+default = []
+i128 = ["meilidb-core/i128"]
+nightly = ["meilidb-core/nightly"]
+
+[dev-dependencies]
+csv = "1.0.7"
+env_logger = "0.6.1"
+jemallocator = "0.1.9"
+quickcheck = "0.8.2"
+rand = "0.6.5"
+rand_xorshift = "0.1.1"
+serde = { version = "1.0.90", features = ["derive"] }
+structopt = "0.2.15"
+tempfile = "3.0.7"
+termcolor = "1.0.4"
diff --git a/examples/create-database.rs b/meilidb/examples/create-database.rs
similarity index 87%
rename from examples/create-database.rs
rename to meilidb/examples/create-database.rs
index 37e252e1a..b0bfa1127 100644
--- a/examples/create-database.rs
+++ b/meilidb/examples/create-database.rs
@@ -9,11 +9,10 @@ use std::error::Error;
 use std::borrow::Cow;
 use std::fs::File;
 
-use serde_derive::{Serialize, Deserialize};
+use serde::{Serialize, Deserialize};
 use structopt::StructOpt;
 
-use meilidb::database::{Database, Schema};
-use meilidb::tokenizer::DefaultBuilder;
+use meilidb_data::{Database, Schema};
 
 #[derive(Debug, StructOpt)]
 pub struct Opt {
@@ -51,9 +50,9 @@ fn index(
     stop_words: &HashSet<String>,
 ) -> Result<Database, Box<Error>>
 {
-    let database = Database::create(database_path)?;
+    let database = Database::start_default(database_path)?;
 
-    database.create_index("default", &schema)?;
+    let index = database.create_index("default".to_string(), schema.clone())?;
 
     let mut rdr = csv::Reader::from_path(csv_data_path)?;
     let mut raw_record = csv::StringRecord::new();
@@ -63,8 +62,7 @@ fn index(
     let mut end_of_file = false;
 
     while !end_of_file {
-        let tokenizer_builder = DefaultBuilder::new();
-        let mut update = database.start_update("default")?;
+        let mut update = index.documents_addition();
 
         loop {
            end_of_file = !rdr.read_record(&mut raw_record)?;
@@ -78,7 +76,7 @@ fn index(
                 }
             };
 
-            update.update_document(&document, &tokenizer_builder, &stop_words)?;
+            update.update_document(&document)?;
 
             print!("\rindexing document {}", i);
            i += 1;
@@ -91,7 +89,7 @@ fn index(
         println!();
         println!("committing update...");
-        database.commit_update(update)?;
+        update.finalize()?;
     }
 
     Ok(database)
diff --git a/examples/query-database.rs b/meilidb/examples/query-database.rs
similarity index 83%
rename from examples/query-database.rs
rename to meilidb/examples/query-database.rs
index ca6733c30..6b048cc5b 100644
--- a/examples/query-database.rs
+++ b/meilidb/examples/query-database.rs
@@ -2,19 +2,19 @@ static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;
 
 use std::collections::btree_map::{BTreeMap, Entry};
+use std::collections::{HashMap, HashSet};
 use std::iter::FromIterator;
 use std::io::{self, Write};
-use std::time::Instant;
+use std::time::{Instant, Duration};
 use std::path::PathBuf;
 use std::error::Error;
 
-use hashbrown::{HashMap, HashSet};
 use termcolor::{Color, ColorChoice, ColorSpec, StandardStream, WriteColor};
 use structopt::StructOpt;
+use meilidb_core::Match;
 
-use meilidb::database::schema::SchemaAttr;
-use meilidb::database::Database;
-use meilidb::Match;
+use meilidb_data::schema::SchemaAttr;
+use meilidb_data::Database;
 
 #[derive(Debug, StructOpt)]
 pub struct Opt {
@@ -138,12 +138,19 @@ fn main() -> Result<(), Box<Error>> {
     let opt = Opt::from_args();
 
     let start = Instant::now();
-    let database = Database::open(&opt.database_path)?;
-    println!("database prepared for you in {:.2?}", start.elapsed());
+    let database = Database::start_default(&opt.database_path)?;
 
     let mut buffer = String::new();
     let input = io::stdin();
 
+    let index = database.open_index("default")?.unwrap();
+    let schema = index.schema();
+
+    println!("database prepared for you in {:.2?}", start.elapsed());
+
+    let fields = opt.displayed_fields.iter().map(String::as_str);
+    let fields = HashSet::from_iter(fields);
+
     loop {
         print!("Searching for: ");
         io::stdout().flush()?;
@@ -151,32 +158,28 @@ fn main() -> Result<(), Box<Error>> {
         if input.read_line(&mut buffer)? == 0 { break }
 
         let query = buffer.trim_end_matches('\n');
 
-        let view = database.view("default")?;
-        let schema = view.schema();
+        let start_total = Instant::now();
 
-        let start = Instant::now();
-
-        let builder = view.query_builder();
+        let builder = index.query_builder();
         let documents = builder.query(query, 0..opt.number_results);
 
+        let mut retrieve_duration = Duration::default();
+
         let number_of_documents = documents.len();
         for mut doc in documents {
 
             doc.matches.sort_unstable_by_key(|m| (m.char_index, m.char_index));
 
-            match view.document_by_id::<HashMap<String, String>>(doc.id) {
-                Ok(document) => {
-                    for name in &opt.displayed_fields {
-                        let attr = match schema.attribute(name) {
-                            Some(attr) => attr,
-                            None => continue,
-                        };
-                        let text = match document.get(name) {
-                            Some(text) => text,
-                            None => continue,
-                        };
+            let start_retrieve = Instant::now();
+            let result = index.document::<HashMap<String, String>>(Some(&fields), doc.id);
+            retrieve_duration += start_retrieve.elapsed();
 
+            match result {
+                Ok(Some(document)) => {
+                    for (name, text) in document {
                         print!("{}: ", name);
+
+                        let attr = schema.attribute(&name).unwrap();
                         let matches = doc.matches.iter()
                             .filter(|m| SchemaAttr::new(m.attribute) == attr)
                             .cloned();
@@ -186,6 +189,7 @@ fn main() -> Result<(), Box<Error>> {
                         println!();
                     }
                 },
+                Ok(None) => eprintln!("missing document"),
                 Err(e) => eprintln!("{}", e),
             }
 
@@ -202,7 +206,8 @@ fn main() -> Result<(), Box<Error>> {
             println!();
         }
 
-        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start.elapsed());
+        eprintln!("document field retrieve took {:.2?}", retrieve_duration);
+        eprintln!("===== Found {} results in {:.2?} =====", number_of_documents, start_total.elapsed());
 
         buffer.clear();
     }
diff --git a/src/common_words.rs b/meilidb/src/common_words.rs
similarity index 100%
rename from src/common_words.rs
rename to meilidb/src/common_words.rs
diff --git a/meilidb/src/lib.rs b/meilidb/src/lib.rs
new file mode 100644
index 000000000..aba7ab6a7
--- /dev/null
+++ b/meilidb/src/lib.rs
@@ -0,0 +1,7 @@
+#![cfg_attr(feature = "nightly", feature(test))]
+
+mod common_words;
+mod sort_by_attr;
+
+pub use self::sort_by_attr::SortByAttr;
+pub use self::common_words::CommonWords;
diff --git a/src/rank/criterion/sort_by_attr.rs b/meilidb/src/sort_by_attr.rs
similarity index 95%
rename from src/rank/criterion/sort_by_attr.rs
rename to meilidb/src/sort_by_attr.rs
index 05033a1e1..f4c4bcc41 100644
--- a/src/rank/criterion/sort_by_attr.rs
+++ b/meilidb/src/sort_by_attr.rs
@@ -2,10 +2,9 @@ use std::cmp::Ordering;
 use std::error::Error;
 use std::fmt;
 
-use crate::database::schema::{Schema, SchemaAttr};
-use crate::rank::criterion::Criterion;
-use crate::database::RankedMap;
-use crate::rank::RawDocument;
+use meilidb_core::criterion::Criterion;
+use meilidb_core::RawDocument;
+use meilidb_data::{Schema, SchemaAttr, RankedMap};
 
 /// A helper struct that permits sorting documents by
 /// some of their stored attributes.
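
The two relocated examples above exercise the whole of the new workspace API. For quick reference, a condensed sketch of the same round trip, using only the calls that appear in those diffs (`Database::start_default`, `create_index`, `documents_addition`, `finalize`, `query_builder`); the `Doc` type, the path literal, and the way the `Schema` value is obtained are placeholders, not part of this changeset:

    use std::error::Error;
    use serde::{Serialize, Deserialize};
    use meilidb_data::{Database, Schema};

    #[derive(Serialize, Deserialize)]
    struct Doc { id: u64, title: String }

    // `schema` is assumed to be built elsewhere, with "id" as its identifier
    fn roundtrip(schema: Schema) -> Result<(), Box<Error>> {
        // open or create the database, as create-database.rs does
        let database = Database::start_default("example.mdb")?;
        let index = database.create_index("default".to_string(), schema.clone())?;

        // document additions are staged on an update and applied by finalize()
        let mut update = index.documents_addition();
        update.update_document(&Doc { id: 0, title: "hello world".to_string() })?;
        update.finalize()?;

        // queries go through the index's query builder, as query-database.rs does
        let builder = index.query_builder();
        let documents = builder.query("hello", 0..10);
        println!("{} documents found", documents.len());
        Ok(())
    }
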
diff --git a/src/database/config.rs b/src/database/config.rs deleted file mode 100644 index 491cdba93..000000000 --- a/src/database/config.rs +++ /dev/null @@ -1,46 +0,0 @@ -use std::collections::{HashSet, HashMap}; -use serde_derive::{Serialize, Deserialize}; - -#[derive(Debug, Copy, Clone, PartialEq, Eq, Serialize, Deserialize)] -#[serde(rename_all = "lowercase")] -pub enum RankingOrdering { - Asc, - Dsc -} - -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct AccessToken { - pub read_key: String, - pub write_key: String, - pub admin_key: String, -} - - -#[derive(Default, Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct Config { - pub stop_words: Option>, - pub ranking_order: Option>, - pub distinct_field: Option, - pub ranking_rules: Option>, - pub access_token: Option, -} - -impl Config { - pub fn update_with(&mut self, new: Config) { - if let Some(stop_words) = new.stop_words { - self.stop_words = Some(stop_words); - }; - if let Some(ranking_order) = new.ranking_order { - self.ranking_order = Some(ranking_order); - }; - if let Some(distinct_field) = new.distinct_field { - self.distinct_field = Some(distinct_field); - }; - if let Some(ranking_rules) = new.ranking_rules { - self.ranking_rules = Some(ranking_rules); - }; - if let Some(access_token) = new.access_token { - self.access_token = Some(access_token); - }; - } -} diff --git a/src/database/document_key.rs b/src/database/document_key.rs deleted file mode 100644 index 52fd428f8..000000000 --- a/src/database/document_key.rs +++ /dev/null @@ -1,149 +0,0 @@ -use std::io::{Cursor, Read, Write}; -use std::mem::size_of; -use std::fmt; - -use byteorder::{BigEndian, WriteBytesExt, ReadBytesExt}; - -use crate::database::schema::SchemaAttr; -use crate::DocumentId; - -const DOC_KEY_LEN: usize = 4 + size_of::(); -const DOC_KEY_ATTR_LEN: usize = DOC_KEY_LEN + 1 + size_of::(); - -#[derive(Copy, Clone)] -pub struct DocumentKey([u8; DOC_KEY_LEN]); - -impl DocumentKey { - pub fn new(id: DocumentId) -> DocumentKey { - let mut buffer = [0; DOC_KEY_LEN]; - - let mut wtr = Cursor::new(&mut buffer[..]); - wtr.write_all(b"doc-").unwrap(); - wtr.write_u64::(id.0).unwrap(); - - DocumentKey(buffer) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKey { - assert!(bytes.len() >= DOC_KEY_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKey(buffer) - } - - pub fn with_attribute(&self, attr: SchemaAttr) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), attr) - } - - pub fn with_attribute_min(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::min()) - } - - pub fn with_attribute_max(&self) -> DocumentKeyAttr { - DocumentKeyAttr::new(self.document_id(), SchemaAttr::max()) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } -} - -impl AsRef<[u8]> for DocumentKey { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKey { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKey") - .field("document_id", &self.document_id()) - .finish() - } -} - -#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord)] -pub struct DocumentKeyAttr([u8; DOC_KEY_ATTR_LEN]); - -impl DocumentKeyAttr { - pub fn new(id: DocumentId, attr: SchemaAttr) -> DocumentKeyAttr { - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - let DocumentKey(raw_key) = DocumentKey::new(id); - - let mut wtr = Cursor::new(&mut 
buffer[..]); - wtr.write_all(&raw_key).unwrap(); - wtr.write_all(b"-").unwrap(); - wtr.write_u16::(attr.0).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn with_attribute_min(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::min()) - } - - pub fn with_attribute_max(id: DocumentId) -> DocumentKeyAttr { - DocumentKeyAttr::new(id, SchemaAttr::max()) - } - - pub fn from_bytes(mut bytes: &[u8]) -> DocumentKeyAttr { - assert!(bytes.len() >= DOC_KEY_ATTR_LEN); - assert_eq!(&bytes[..4], b"doc-"); - - let mut buffer = [0; DOC_KEY_ATTR_LEN]; - bytes.read_exact(&mut buffer).unwrap(); - - DocumentKeyAttr(buffer) - } - - pub fn document_id(&self) -> DocumentId { - let id = (&self.0[4..]).read_u64::().unwrap(); - DocumentId(id) - } - - pub fn attribute(&self) -> SchemaAttr { - let offset = 4 + size_of::() + 1; - let value = (&self.0[offset..]).read_u16::().unwrap(); - SchemaAttr::new(value) - } - - pub fn into_document_key(self) -> DocumentKey { - DocumentKey::new(self.document_id()) - } -} - -impl AsRef<[u8]> for DocumentKeyAttr { - fn as_ref(&self) -> &[u8] { - &self.0 - } -} - -impl fmt::Debug for DocumentKeyAttr { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("DocumentKeyAttr") - .field("document_id", &self.document_id()) - .field("attribute", &self.attribute().0) - .finish() - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn keep_as_ref_order() { - for (a, b) in (0..).zip(1..).take(u16::max_value() as usize - 1) { - let id = DocumentId(0); - let a = DocumentKeyAttr::new(id, SchemaAttr(a)); - let b = DocumentKeyAttr::new(id, SchemaAttr(b)); - - assert!(a < b); - assert!(a.as_ref() < b.as_ref()); - } - } -} diff --git a/src/database/mod.rs b/src/database/mod.rs deleted file mode 100644 index 70ca62d92..000000000 --- a/src/database/mod.rs +++ /dev/null @@ -1,911 +0,0 @@ -use std::time::Instant; -use std::error::Error; -use std::ffi::OsStr; -use std::sync::Arc; -use std::fs; -use std::path::{Path, PathBuf}; -use std::sync::atomic::{AtomicBool, Ordering}; -use std::ops::{Deref, DerefMut}; - -use rocksdb::rocksdb_options::{DBOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{Writable, Snapshot}; -use rocksdb::{DB, MergeOperands}; -use size_format::SizeFormatterBinary; -use arc_swap::ArcSwap; -use lockfree::map::Map; -use hashbrown::HashMap; -use log::{info, error, warn}; - -use crate::database::schema::SchemaAttr; -use crate::shared_data_cursor::FromSharedDataCursor; -use crate::write_to_bytes::WriteToBytes; -use crate::DocumentId; - -use self::update::{ReadIndexEvent, ReadRankedMapEvent}; - -pub use self::config::Config; -pub use self::document_key::{DocumentKey, DocumentKeyAttr}; -pub use self::view::{DatabaseView, DocumentIter}; -pub use self::update::Update; -pub use self::serde::SerializerError; -pub use self::schema::Schema; -pub use self::index::Index; -pub use self::number::{Number, ParseNumberError}; - -pub type RankedMap = HashMap<(DocumentId, SchemaAttr), Number>; - -const DATA_INDEX: &[u8] = b"data-index"; -const DATA_RANKED_MAP: &[u8] = b"data-ranked-map"; -const DATA_SCHEMA: &[u8] = b"data-schema"; -const CONFIG: &[u8] = b"config"; - -pub mod config; -pub mod schema; -pub(crate) mod index; -mod number; -mod document_key; -mod serde; -mod update; -mod view; - -fn retrieve_data_schema(snapshot: &Snapshot) -> Result> -where D: Deref -{ - match snapshot.get(DATA_SCHEMA)? 
{ - Some(vector) => Ok(Schema::read_from_bin(&*vector)?), - None => Err(String::from("BUG: no schema found in the database").into()), - } -} - -fn retrieve_data_index(snapshot: &Snapshot) -> Result> -where D: Deref -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_INDEX)?; - info!("loading index from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("index size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadIndexEvent::from_bytes(bytes)?; - let index = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading index from bytes took {:.2?}", start.elapsed()); - - Ok(index) - }, - None => Ok(Index::default()), - } -} - -fn retrieve_data_ranked_map(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - let start = Instant::now(); - let vector = snapshot.get(DATA_RANKED_MAP)?; - info!("loading ranked map from kv-store took {:.2?}", start.elapsed()); - - match vector { - Some(vector) => { - let start = Instant::now(); - - let bytes = vector.as_ref().to_vec(); - info!("ranked map size is {}B", SizeFormatterBinary::new(bytes.len() as u64)); - - let event = ReadRankedMapEvent::from_bytes(bytes)?; - let ranked_map = event.updated_documents().expect("BUG: invalid event deserialized"); - - info!("loading ranked map from bytes took {:.2?}", start.elapsed()); - - Ok(ranked_map) - }, - None => Ok(RankedMap::new()), - } -} - -fn retrieve_config(snapshot: &Snapshot) -> Result> -where D: Deref, -{ - match snapshot.get(CONFIG)? { - Some(vector) => Ok(bincode::deserialize(&*vector)?), - None => Ok(Config::default()), - } -} - -fn merge_indexes(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadIndexEvent::{self, *}; - use self::update::WriteIndexEvent; - - let mut index = Index::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadIndexEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => index = index.remove_documents(d.as_ref()), - UpdatedDocuments(i) => index = index.union(&i), - } - } - - WriteIndexEvent::UpdatedDocuments(&index).into_bytes() -} - -fn merge_ranked_maps(existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - use self::update::ReadRankedMapEvent::{self, *}; - use self::update::WriteRankedMapEvent; - - let mut ranked_map = RankedMap::default(); - for bytes in existing.into_iter().chain(operands) { - match ReadRankedMapEvent::from_bytes(bytes.to_vec()).unwrap() { - RemovedDocuments(d) => ranked_map.retain(|(k, _), _| !d.as_ref().binary_search(k).is_ok()), - UpdatedDocuments(i) => ranked_map.extend(i), - } - } - - WriteRankedMapEvent::UpdatedDocuments(&ranked_map).into_bytes() -} - -fn merge_operator(key: &[u8], existing: Option<&[u8]>, operands: &mut MergeOperands) -> Vec { - match key { - DATA_INDEX => merge_indexes(existing, operands), - DATA_RANKED_MAP => merge_ranked_maps(existing, operands), - key => panic!("The merge operator does not support merging {:?}", key), - } -} - -pub struct IndexUpdate { - index: String, - update: Update, -} - -impl Deref for IndexUpdate { - type Target = Update; - - fn deref(&self) -> &Update { - &self.update - } -} - -impl DerefMut for IndexUpdate { - fn deref_mut(&mut self) -> &mut Update { - &mut self.update - } -} - -struct DatabaseIndex { - db: Arc, - - // This view is updated each time the DB ingests an update. - view: ArcSwap>>, - - // The path of the mdb folder stored on disk. 
- path: PathBuf, - - // must_die false by default, must be set as true when the Index is dropped. - // It is used to erase the folder saved on disk when the user request to delete an index. - must_die: AtomicBool, -} - -impl DatabaseIndex { - fn create>(path: P, schema: &Schema) -> Result> { - let path = path.as_ref(); - if path.exists() { - return Err(format!("File already exists at path: {}, cannot create database.", - path.display()).into()) - } - - let path_lossy = path.to_string_lossy(); - let mut opts = DBOptions::new(); - opts.create_if_missing(true); - // opts.error_if_exists(true); // FIXME pull request that - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - let mut schema_bytes = Vec::new(); - schema.write_to_bin(&mut schema_bytes)?; - db.put(DATA_SCHEMA, &schema_bytes)?; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn open>(path: P) -> Result> { - let path_lossy = path.as_ref().to_string_lossy(); - - let mut opts = DBOptions::new(); - opts.create_if_missing(false); - - let mut cf_opts = ColumnFamilyOptions::new(); - cf_opts.add_merge_operator("data merge operator", merge_operator); - - let db = DB::open_cf(opts, &path_lossy, vec![("default", cf_opts)])?; - - // FIXME create a generic function to do that ! - let _schema = match db.get(DATA_SCHEMA)? { - Some(value) => Schema::read_from_bin(&*value)?, - None => return Err(String::from("Database does not contain a schema").into()), - }; - - let db = Arc::new(db); - let snapshot = Snapshot::new(db.clone()); - let view = ArcSwap::new(Arc::new(DatabaseView::new(snapshot)?)); - - Ok(DatabaseIndex { - db: db, - view: view, - path: path.as_ref().to_path_buf(), - must_die: AtomicBool::new(false) - }) - } - - fn must_die(&self) { - self.must_die.store(true, Ordering::Relaxed) - } - - fn start_update(&self) -> Result> { - let schema = match self.db.get(DATA_SCHEMA)? 
{ - Some(value) => Schema::read_from_bin(&*value)?, - None => panic!("Database does not contain a schema"), - }; - - Ok(Update::new(schema)) - } - - fn commit_update(&self, update: Update) -> Result>>, Box> { - let batch = update.build()?; - self.db.write(batch)?; - self.db.compact_range(None, None); - self.db.flush(true)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn view(&self) -> Arc>> { - self.view.load() - } - - fn get_config(&self) -> Config { - self.view().config().clone() - } - - fn update_config(&self, config: Config) -> Result>>, Box>{ - let data = bincode::serialize(&config)?; - self.db.put(CONFIG, &data)?; - - let snapshot = Snapshot::new(self.db.clone()); - let view = Arc::new(DatabaseView::new(snapshot)?); - self.view.store(view.clone()); - - Ok(view) - } - - fn path(&self) -> &Path { - self.path.as_path() - } -} - -impl Drop for DatabaseIndex { - fn drop(&mut self) { - if self.must_die.load(Ordering::Relaxed) { - if let Err(err) = fs::remove_dir_all(&self.path) { - error!("Impossible to remove mdb when Database is dropped; {}", err); - } - } - } -} - -pub struct Database { - indexes: Map>, - path: PathBuf, -} - -impl Database { - pub fn create>(path: P) -> Result> { - Ok(Database { - indexes: Map::new(), - path: path.as_ref().to_path_buf(), - }) - } - - pub fn open>(path: P) -> Result> { - let entries = fs::read_dir(&path)?; - - let indexes = Map::new(); - for entry in entries { - let path = match entry { - Ok(p) => p.path(), - Err(err) => { - warn!("Impossible to retrieve the path from an entry; {}", err); - continue - } - }; - - let name = match path.file_stem().and_then(OsStr::to_str) { - Some(name) => name.to_owned(), - None => continue - }; - - let db = match DatabaseIndex::open(path.clone()) { - Ok(db) => db, - Err(err) => { - warn!("Impossible to open the database; {}", err); - continue - } - }; - - info!("Load database {}", name); - indexes.insert(name, Arc::new(db)); - } - - Ok(Database { - indexes: indexes, - path: path.as_ref().to_path_buf(), - }) - } - - pub fn create_index(&self, name: &str, schema: &Schema) -> Result<(), Box> { - let index_path = self.path.join(name); - - if index_path.exists() { - return Err("Index already exists".into()); - } - - let index = DatabaseIndex::create(index_path, schema)?; - self.indexes.insert(name.to_owned(), Arc::new(index)); - - Ok(()) - } - - pub fn delete_index(&self, name: &str) -> Result<(), Box> { - let index_guard = self.indexes.remove(name).ok_or("Index not found")?; - index_guard.val().must_die(); - - Ok(()) - } - - pub fn list_indexes(&self) -> Vec { - self.indexes.iter().map(|g| g.key().clone()).collect() - } - - pub fn start_update(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let update = index_guard.val().start_update()?; - - Ok(IndexUpdate { index: index.to_owned(), update }) - } - - pub fn commit_update(&self, update: IndexUpdate)-> Result>>, Box> { - let index_guard = self.indexes.get(&update.index).ok_or("Index not found")?; - - index_guard.val().commit_update(update.update) - } - - pub fn view(&self, index: &str) -> Result>>, Box> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().view()) - } - - pub fn get_config(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().get_config()) - } - - pub fn update_config(&self, 
index: &str, config: Config) -> Result>>, Box>{ - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - - Ok(index_guard.val().update_config(config)?) - } - - pub fn path(&self) -> &Path { - self.path.as_path() - } - - pub fn index_path(&self, index: &str) -> Result> { - let index_guard = self.indexes.get(index).ok_or("Index not found")?; - let path = index_guard.val().path(); - Ok(path.to_path_buf()) - } - -} - -#[cfg(test)] -mod tests { - use std::collections::HashSet; - use std::error::Error; - - use serde_derive::{Serialize, Deserialize}; - - use crate::database::schema::{SchemaBuilder, STORED, INDEXED}; - use crate::tokenizer::DefaultBuilder; - - use super::*; - - #[test] - fn ingest_one_easy_update() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - - let tokenizer_builder = DefaultBuilder::new(); - let mut builder = database.start_update(meilidb_index_name)?; - - let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?; - let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?; - - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - Ok(dir.close()?) 
- } - - #[test] - fn ingest_two_easy_updates() -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let meilidb_path = dir.path().join("meilidb.mdb"); - let meilidb_index_name = "default"; - - #[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] - struct SimpleDoc { - id: u64, - title: String, - description: String, - timestamp: u64, - } - - let schema = { - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("id", STORED); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - builder.new_attribute("timestamp", STORED); - builder.build() - }; - - let database = Database::create(&meilidb_path)?; - - database.create_index(meilidb_index_name, &schema)?; - - let doc0 = SimpleDoc { - id: 0, - title: String::from("I am a title"), - description: String::from("I am a description"), - timestamp: 1234567, - }; - let doc1 = SimpleDoc { - id: 1, - title: String::from("I am the second title"), - description: String::from("I am the second description"), - timestamp: 7654321, - }; - let doc2 = SimpleDoc { - id: 2, - title: String::from("I am the third title"), - description: String::from("I am the third description"), - timestamp: 7654321, - }; - let doc3 = SimpleDoc { - id: 3, - title: String::from("I am the fourth title"), - description: String::from("I am the fourth description"), - timestamp: 7654321, - }; - - let tokenizer_builder = DefaultBuilder::new(); - - let mut builder = database.start_update(meilidb_index_name)?; - let docid0 = builder.update_document(&doc0, &tokenizer_builder, &stop_words)?; - let docid1 = builder.update_document(&doc1, &tokenizer_builder, &stop_words)?; - database.commit_update(builder)?; - - let mut builder = database.start_update(meilidb_index_name)?; - let docid2 = builder.update_document(&doc2, &tokenizer_builder, &stop_words)?; - let docid3 = builder.update_document(&doc3, &tokenizer_builder, &stop_words)?; - let view = database.commit_update(builder)?; - - let de_doc0: SimpleDoc = view.document_by_id(docid0)?; - let de_doc1: SimpleDoc = view.document_by_id(docid1)?; - - assert_eq!(doc0, de_doc0); - assert_eq!(doc1, de_doc1); - - let de_doc2: SimpleDoc = view.document_by_id(docid2)?; - let de_doc3: SimpleDoc = view.document_by_id(docid3)?; - - assert_eq!(doc2, de_doc2); - assert_eq!(doc3, de_doc3); - - Ok(dir.close()?) 
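Both tests declare attributes with `STORED | INDEXED`, i.e. schema properties are combinable flags. A hypothetical model of such a flag type (the names mirror the test code, but the real `database::schema` implementation may differ):

```rust
use std::ops::BitOr;

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Props(u8);

pub const STORED: Props = Props(0b0001);
pub const INDEXED: Props = Props(0b0010);

impl BitOr for Props {
    type Output = Props;
    fn bitor(self, rhs: Props) -> Props {
        Props(self.0 | rhs.0)
    }
}

impl Props {
    pub fn is_stored(self) -> bool { self.0 & STORED.0 != 0 }
    pub fn is_indexed(self) -> bool { self.0 & INDEXED.0 != 0 }
}

fn main() {
    let props = STORED | INDEXED;
    assert!(props.is_stored() && props.is_indexed());
    assert!(!STORED.is_indexed());
}
```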
- } -} - -#[cfg(all(feature = "nightly", test))] -mod bench { - extern crate test; - - use std::collections::HashSet; - use std::error::Error; - use std::iter::repeat_with; - use self::test::Bencher; - - use rand::distributions::Alphanumeric; - use rand_xorshift::XorShiftRng; - use rand::{Rng, SeedableRng}; - use serde_derive::Serialize; - use rand::seq::SliceRandom; - - use crate::tokenizer::DefaultBuilder; - use crate::database::schema::*; - - use super::*; - - fn random_sentences(number: usize, rng: &mut R) -> String { - let mut words = String::new(); - - for i in 0..number { - let word_len = rng.gen_range(1, 12); - let iter = repeat_with(|| rng.sample(Alphanumeric)).take(word_len); - words.extend(iter); - - if i == number - 1 { // last word - let final_ = [".", "?", "!", "..."].choose(rng).cloned(); - words.extend(final_); - } else { - let middle = [",", ", "].choose(rng).cloned(); - words.extend(middle); - } - } - - words - } - - #[bench] - fn open_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn open_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn open_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = 
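Every benchmark in this module seeds `XorShiftRng` with the constant 42, so `random_sentences` produces the exact same corpus on every run and the timings stay comparable across runs and machines. A small demonstration of that determinism, written against the same `rand` 0.6 and `rand_xorshift` 0.1 APIs the dev-dependencies pin:

```rust
use rand::distributions::Alphanumeric;
use rand::{Rng, SeedableRng};
use rand_xorshift::XorShiftRng;

// One random word, like a single iteration of `random_sentences`.
fn word<R: Rng>(rng: &mut R) -> String {
    let len: usize = rng.gen_range(1, 12); // two-argument form, as in rand 0.6
    std::iter::repeat_with(|| rng.sample(Alphanumeric)).take(len).collect()
}

fn main() {
    let mut a = XorShiftRng::seed_from_u64(42);
    let mut b = XorShiftRng::seed_from_u64(42);
    // Same seed, same stream: the benchmark corpus is fully reproducible.
    for _ in 0..5 {
        assert_eq!(word(&mut a), word(&mut b));
    }
}
```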
HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - database.commit_update(builder)?; - - drop(database); - - bench.iter(|| { - let database = Database::open(db_path.clone()).unwrap(); - test::black_box(|| database); - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_little_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..300 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - fn search_oneletter_medium_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..3000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = 
view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } - - #[bench] - #[ignore] - fn search_oneletter_big_database(bench: &mut Bencher) -> Result<(), Box> { - let dir = tempfile::tempdir()?; - let stop_words = HashSet::new(); - - let mut builder = SchemaBuilder::with_identifier("id"); - builder.new_attribute("title", STORED | INDEXED); - builder.new_attribute("description", STORED | INDEXED); - let schema = builder.build(); - - let db_path = dir.path().join("bench.mdb"); - let index_name = "default"; - - let database = Database::create(&db_path)?; - database.create_index(index_name, &schema)?; - - #[derive(Serialize)] - struct Document { - id: u64, - title: String, - description: String, - } - - let tokenizer_builder = DefaultBuilder; - let mut builder = database.start_update(index_name)?; - let mut rng = XorShiftRng::seed_from_u64(42); - - for i in 0..30_000 { - let document = Document { - id: i, - title: random_sentences(rng.gen_range(1, 8), &mut rng), - description: random_sentences(rng.gen_range(20, 200), &mut rng), - }; - builder.update_document(&document, &tokenizer_builder, &stop_words)?; - } - - let view = database.commit_update(builder)?; - - bench.iter(|| { - for q in &["a", "b", "c", "d", "e"] { - let documents = view.query_builder().query(q, 0..20); - test::black_box(|| documents); - } - }); - - Ok(()) - } -} diff --git a/src/database/number.rs b/src/database/number.rs deleted file mode 100644 index b2c4c9a88..000000000 --- a/src/database/number.rs +++ /dev/null @@ -1,98 +0,0 @@ -use std::cmp::Ordering; -use std::str::FromStr; -use std::fmt; - -use serde_derive::{Serialize, Deserialize}; - -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone)] -pub enum Number { - Unsigned(u64), - Signed(i64), - Float(f64), -} - -impl FromStr for Number { - type Err = ParseNumberError; - - fn from_str(s: &str) -> Result { - if let Ok(unsigned) = u64::from_str(s) { - return Ok(Number::Unsigned(unsigned)) - } - - if let Ok(signed) = i64::from_str(s) { - return Ok(Number::Signed(signed)) - } - - if let Ok(float) = f64::from_str(s) { - if float == 0.0 || float.is_normal() { - return Ok(Number::Float(float)) - } - } - - Err(ParseNumberError) - } -} - -impl PartialOrd for Number { - fn partial_cmp(&self, other: &Number) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for Number { - fn cmp(&self, other: &Number) -> Ordering { - use Number::*; - match (self, other) { - (Unsigned(s), Unsigned(o)) => s.cmp(o), - (Unsigned(s), Signed(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Unsigned(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - - (Signed(s), Unsigned(o)) => { - let s = i128::from(*s); - let o = i128::from(*o); - s.cmp(&o) - }, - (Signed(s), Signed(o)) => s.cmp(o), - (Signed(s), Float(o)) => { - let s = *s as f64; - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - - (Float(s), Unsigned(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Signed(o)) => { - let o = *o as f64; - s.partial_cmp(&o).unwrap_or(Ordering::Equal) - }, - (Float(s), Float(o)) => { - s.partial_cmp(o).unwrap_or(Ordering::Equal) - }, - } - } -} - -impl PartialEq for Number { - fn eq(&self, other: &Number) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for Number { } - -pub struct ParseNumberError; - -impl fmt::Display for ParseNumberError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.write_str("can not parse 
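The `Ord` implementation for `Number` above makes mixed-variant comparisons total: integer-only pairs are widened to `i128`, which holds every `u64` and every `i64` value, while any pair involving a float is compared as `f64` with `Ordering::Equal` as the fallback (NaN is already rejected by `FromStr`, which only accepts `0.0` or normal floats). A condensed, self-contained sketch of that widening strategy:

```rust
use std::cmp::Ordering;

#[derive(Debug, Copy, Clone)]
enum Number {
    Unsigned(u64),
    Signed(i64),
    Float(f64),
}

fn cmp_numbers(a: Number, b: Number) -> Ordering {
    use Number::*;
    match (a, b) {
        // Integer vs integer: widen to i128, so u64 vs i64 never overflows.
        (Unsigned(a), Signed(b)) => i128::from(a).cmp(&i128::from(b)),
        (Signed(a), Unsigned(b)) => i128::from(a).cmp(&i128::from(b)),
        (Unsigned(a), Unsigned(b)) => a.cmp(&b),
        (Signed(a), Signed(b)) => a.cmp(&b),
        // Any float involved: compare as f64; NaN (excluded at parse time)
        // would collapse to Equal instead of poisoning the order.
        (a, b) => to_f64(a).partial_cmp(&to_f64(b)).unwrap_or(Ordering::Equal),
    }
}

fn to_f64(n: Number) -> f64 {
    match n {
        Number::Unsigned(u) => u as f64,
        Number::Signed(i) => i as f64,
        Number::Float(f) => f,
    }
}

fn main() {
    assert_eq!(cmp_numbers(Number::Unsigned(3), Number::Signed(-1)), Ordering::Greater);
    assert_eq!(cmp_numbers(Number::Signed(2), Number::Float(2.5)), Ordering::Less);
}
```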
number") - } -} diff --git a/src/database/serde/deserializer.rs b/src/database/serde/deserializer.rs deleted file mode 100644 index 26d74984d..000000000 --- a/src/database/serde/deserializer.rs +++ /dev/null @@ -1,186 +0,0 @@ -use std::error::Error; -use std::ops::Deref; -use std::fmt; - -use rocksdb::rocksdb::{DB, Snapshot, SeekKey}; -use rocksdb::rocksdb_options::ReadOptions; -use serde::forward_to_deserialize_any; -use serde::de::value::MapDeserializer; -use serde::de::{self, Visitor, IntoDeserializer}; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::schema::Schema; -use crate::DocumentId; - -pub struct Deserializer<'a, D> -where D: Deref -{ - snapshot: &'a Snapshot, - schema: &'a Schema, - document_id: DocumentId, -} - -impl<'a, D> Deserializer<'a, D> -where D: Deref -{ - pub fn new(snapshot: &'a Snapshot, schema: &'a Schema, doc: DocumentId) -> Self { - Deserializer { snapshot, schema, document_id: doc } - } -} - -impl<'de, 'a, 'b, D> de::Deserializer<'de> for &'b mut Deserializer<'a, D> -where D: Deref -{ - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_map(visitor) - } - - forward_to_deserialize_any! { - bool u8 u16 u32 u64 i8 i16 i32 i64 f32 f64 char str string unit seq - bytes byte_buf unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } - - fn deserialize_map(self, visitor: V) -> Result - where V: Visitor<'de> - { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(self.document_id); - let upper = lower.with_attribute_max(); - options.set_iterate_lower_bound(lower.as_ref()); - options.set_iterate_upper_bound(upper.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - - if iter.kv().is_none() { - // FIXME return an error - } - - let iter = iter.map(|(key, value)| { - // retrieve the schema attribute name - // from the schema attribute number - let document_key_attr = DocumentKeyAttr::from_bytes(&key); - let schema_attr = document_key_attr.attribute(); - let attribute_name = self.schema.attribute_name(schema_attr); - (attribute_name, Value(value)) - }); - - let map_deserializer = MapDeserializer::new(iter); - visitor.visit_map(map_deserializer) - } -} - -struct Value(Vec); - -impl<'de> IntoDeserializer<'de, DeserializerError> for Value { - type Deserializer = Self; - - fn into_deserializer(self) -> Self::Deserializer { - self - } -} - -macro_rules! 
forward_to_bincode_values { - ($($ty:ident => $de_method:ident,)*) => { - $( - fn $de_method(self, visitor: V) -> Result - where V: de::Visitor<'de> - { - match bincode::deserialize::<$ty>(&self.0) { - Ok(val) => val.into_deserializer().$de_method(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - )* - } -} - -impl<'de, 'a> de::Deserializer<'de> for Value { - type Error = DeserializerError; - - fn deserialize_any(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.0.into_deserializer().deserialize_any(visitor) - } - - fn deserialize_str(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_string(visitor) - } - - fn deserialize_string(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::(&self.0) { - Ok(val) => val.into_deserializer().deserialize_string(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - fn deserialize_bytes(self, visitor: V) -> Result - where V: Visitor<'de> - { - self.deserialize_byte_buf(visitor) - } - - fn deserialize_byte_buf(self, visitor: V) -> Result - where V: Visitor<'de> - { - match bincode::deserialize::>(&self.0) { - Ok(val) => val.into_deserializer().deserialize_byte_buf(visitor), - Err(e) => Err(de::Error::custom(e)), - } - } - - forward_to_bincode_values! { - char => deserialize_char, - bool => deserialize_bool, - - u8 => deserialize_u8, - u16 => deserialize_u16, - u32 => deserialize_u32, - u64 => deserialize_u64, - - i8 => deserialize_i8, - i16 => deserialize_i16, - i32 => deserialize_i32, - i64 => deserialize_i64, - - f32 => deserialize_f32, - f64 => deserialize_f64, - } - - forward_to_deserialize_any! { - unit seq map - unit_struct tuple_struct - identifier tuple ignored_any option newtype_struct enum struct - } -} - -#[derive(Debug)] -pub enum DeserializerError { - Custom(String), -} - -impl de::Error for DeserializerError { - fn custom(msg: T) -> Self { - DeserializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for DeserializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - DeserializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for DeserializerError {} diff --git a/src/database/serde/indexer_serializer.rs b/src/database/serde/indexer_serializer.rs deleted file mode 100644 index c25ffe98c..000000000 --- a/src/database/serde/indexer_serializer.rs +++ /dev/null @@ -1,194 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; - -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; -use crate::tokenizer::TokenizerBuilder; -use crate::tokenizer::Token; -use crate::{is_cjk, DocumentId, DocIndex}; - -pub struct IndexerSerializer<'a, 'b, B> { - pub tokenizer_builder: &'a B, - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub attribute: SchemaAttr, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b, B> ser::Serializer for IndexerSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = ser::Impossible; - type SerializeStruct = ser::Impossible; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! 
{ - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, v: &str) -> Result { - for token in self.tokenizer_builder.build(v) { - let Token { word, word_index, char_index } = token; - let document_id = self.document_id; - - // FIXME must u32::try_from instead - let attribute = self.attribute.0; - let word_index = word_index as u16; - - // insert the exact representation - let word_lower = word.to_lowercase(); - let length = word.chars().count() as u16; - - if self.stop_words.contains(&word_lower) { continue } - - // and the unidecoded lowercased version - if !word_lower.chars().any(is_cjk) { - let word_unidecoded = unidecode::unidecode(word).to_lowercase(); - let word_unidecoded = word_unidecoded.trim(); - if word_lower != word_unidecoded { - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_unidecoded.as_bytes().to_vec(), doc_index)?; - } - } - - let char_index = char_index as u16; - let char_length = length; - - let doc_index = DocIndex { document_id, attribute, word_index, char_index, char_length }; - self.update.insert_doc_index(word_lower.into_bytes(), doc_index)?; - } - Ok(()) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "seq" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "map" }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - 
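`serialize_str` above is where indexing actually happens: each token is indexed under its lowercased form and, when the unidecoded (accent-stripped) form differs, under that form as well, so a query typed without accents still matches. A standalone sketch of just that normalization step, using the same `unidecode` crate; the CJK check and the `DocIndex` position bookkeeping are omitted:

```rust
use std::collections::HashSet;
use unidecode::unidecode;

/// Returns the index keys a single token should be stored under:
/// the unidecoded form when it differs, plus the lowercased word itself.
fn index_keys(word: &str, stop_words: &HashSet<String>) -> Vec<String> {
    let lower = word.to_lowercase();
    if stop_words.contains(&lower) {
        return Vec::new(); // stop words are never indexed
    }

    let mut keys = Vec::new();
    let unidecoded = unidecode(word).to_lowercase();
    let unidecoded = unidecoded.trim().to_string();
    if unidecoded != lower && !unidecoded.is_empty() {
        keys.push(unidecoded);
    }
    keys.push(lower);
    keys
}

fn main() {
    let stop_words = HashSet::new();
    assert_eq!(index_keys("Aïe", &stop_words), vec!["aie".to_string(), "aïe".to_string()]);
    assert_eq!(index_keys("Title", &stop_words), vec!["title".to_string()]);
}
```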
Err(SerializerError::UnserializableType { name: "struct" }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} diff --git a/src/database/serde/mod.rs b/src/database/serde/mod.rs deleted file mode 100644 index 493124f7e..000000000 --- a/src/database/serde/mod.rs +++ /dev/null @@ -1,65 +0,0 @@ -use std::collections::hash_map::DefaultHasher; -use std::hash::{Hash, Hasher}; -use std::error::Error; -use std::fmt; - -use serde::ser; - -macro_rules! forward_to_unserializable_type { - ($($ty:ident => $se_method:ident,)*) => { - $( - fn $se_method(self, _v: $ty) -> Result { - Err(SerializerError::UnserializableType { name: "$ty" }) - } - )* - } -} - -pub mod find_id; -pub mod key_to_string; -pub mod value_to_number; -pub mod serializer; -pub mod indexer_serializer; -pub mod deserializer; - -pub fn calculate_hash(t: &T) -> u64 { - let mut s = DefaultHasher::new(); - t.hash(&mut s); - s.finish() -} - -#[derive(Debug)] -pub enum SerializerError { - DocumentIdNotFound, - UnserializableType { name: &'static str }, - Custom(String), -} - -impl ser::Error for SerializerError { - fn custom(msg: T) -> Self { - SerializerError::Custom(msg.to_string()) - } -} - -impl fmt::Display for SerializerError { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { - SerializerError::DocumentIdNotFound => { - write!(f, "serialized document does not have an id according to the schema") - } - SerializerError::UnserializableType { name } => { - write!(f, "Only struct and map types are considered valid documents and - can be serialized, not {} types directly.", name) - }, - SerializerError::Custom(s) => f.write_str(&s), - } - } -} - -impl Error for SerializerError {} - -impl From for SerializerError { - fn from(value: String) -> SerializerError { - SerializerError::Custom(value) - } -} diff --git a/src/database/serde/serializer.rs b/src/database/serde/serializer.rs deleted file mode 100644 index 2f41bb82c..000000000 --- a/src/database/serde/serializer.rs +++ /dev/null @@ -1,296 +0,0 @@ -use std::collections::HashSet; - -use serde::Serialize; -use serde::ser; - -use crate::database::serde::indexer_serializer::IndexerSerializer; -use crate::database::serde::key_to_string::KeyToStringSerializer; -use crate::database::serde::value_to_number::ValueToNumberSerializer; -use crate::database::update::DocumentUpdate; -use crate::database::serde::SerializerError; -use crate::tokenizer::TokenizerBuilder; -use crate::database::schema::Schema; -use crate::DocumentId; - -pub struct Serializer<'a, 'b, B> { - pub schema: &'a Schema, - pub update: &'a mut DocumentUpdate<'b>, - pub document_id: DocumentId, - pub tokenizer_builder: &'a B, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b, B> ser::Serializer for Serializer<'a, 'b, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - type SerializeSeq = ser::Impossible; - type SerializeTuple = ser::Impossible; - type SerializeTupleStruct = ser::Impossible; - type SerializeTupleVariant = ser::Impossible; - type SerializeMap = MapSerializer<'a, 'b, B>; - type SerializeStruct = StructSerializer<'a, 'b, B>; - type SerializeStructVariant = ser::Impossible; - - forward_to_unserializable_type! 
{ - bool => serialize_bool, - char => serialize_char, - - i8 => serialize_i8, - i16 => serialize_i16, - i32 => serialize_i32, - i64 => serialize_i64, - - u8 => serialize_u8, - u16 => serialize_u16, - u32 => serialize_u32, - u64 => serialize_u64, - - f32 => serialize_f32, - f64 => serialize_f64, - } - - fn serialize_str(self, _v: &str) -> Result { - Err(SerializerError::UnserializableType { name: "str" }) - } - - fn serialize_bytes(self, _v: &[u8]) -> Result { - Err(SerializerError::UnserializableType { name: "&[u8]" }) - } - - fn serialize_none(self) -> Result { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_some(self, _value: &T) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "Option" }) - } - - fn serialize_unit(self) -> Result { - Err(SerializerError::UnserializableType { name: "()" }) - } - - fn serialize_unit_struct(self, _name: &'static str) -> Result { - Err(SerializerError::UnserializableType { name: "unit struct" }) - } - - fn serialize_unit_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str - ) -> Result - { - Err(SerializerError::UnserializableType { name: "unit variant" }) - } - - fn serialize_newtype_struct( - self, - _name: &'static str, - value: &T - ) -> Result - where T: Serialize, - { - value.serialize(self) - } - - fn serialize_newtype_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _value: &T - ) -> Result - where T: Serialize, - { - Err(SerializerError::UnserializableType { name: "newtype variant" }) - } - - fn serialize_seq(self, _len: Option) -> Result { - Err(SerializerError::UnserializableType { name: "sequence" }) - } - - fn serialize_tuple(self, _len: usize) -> Result { - Err(SerializerError::UnserializableType { name: "tuple" }) - } - - fn serialize_tuple_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple struct" }) - } - - fn serialize_tuple_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "tuple variant" }) - } - - fn serialize_map(self, _len: Option) -> Result { - Ok(MapSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - tokenizer_builder: self.tokenizer_builder, - stop_words: self.stop_words, - current_key_name: None, - }) - } - - fn serialize_struct( - self, - _name: &'static str, - _len: usize - ) -> Result - { - Ok(StructSerializer { - schema: self.schema, - document_id: self.document_id, - update: self.update, - tokenizer_builder: self.tokenizer_builder, - stop_words: self.stop_words, - }) - } - - fn serialize_struct_variant( - self, - _name: &'static str, - _variant_index: u32, - _variant: &'static str, - _len: usize - ) -> Result - { - Err(SerializerError::UnserializableType { name: "struct variant" }) - } -} - -pub struct MapSerializer<'a, 'b, B> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub tokenizer_builder: &'a B, - pub stop_words: &'a HashSet, - pub current_key_name: Option, -} - -impl<'a, 'b, B> ser::SerializeMap for MapSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - - fn serialize_key(&mut self, key: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - self.current_key_name = Some(key); - 
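The `MapSerializer` and `StructSerializer` handed out above implement the core dispatch of indexing: each field is looked up in the schema, and its properties decide whether the value is stored as a blob, tokenized into the index, registered as a ranked number, or any combination of the three. A schematic model of that routing (the `Props` and `Update` types here are illustrative stand-ins, not the real API, which serializes values with bincode and tokenizes through `IndexerSerializer`):

```rust
use std::collections::HashMap;

#[derive(Clone, Copy)]
struct Props {
    stored: bool,
    indexed: bool,
    ranked: bool,
}

/// Stand-ins for the batch operations DocumentUpdate exposes.
#[derive(Default, Debug)]
struct Update {
    stored_values: Vec<(String, String)>,
    indexed_words: Vec<String>,
    ranked_numbers: Vec<(String, i64)>,
}

/// One field of a document flows into up to three places,
/// depending on its schema properties.
fn route_field(schema: &HashMap<&str, Props>, update: &mut Update, key: &str, value: &str) {
    let props = match schema.get(key) {
        Some(props) => *props,
        None => return, // fields absent from the schema are simply skipped
    };
    if props.stored {
        update.stored_values.push((key.to_owned(), value.to_owned()));
    }
    if props.indexed {
        update.indexed_words.extend(value.split_whitespace().map(str::to_lowercase));
    }
    if props.ranked {
        if let Ok(n) = value.parse() {
            update.ranked_numbers.push((key.to_owned(), n));
        }
    }
}

fn main() {
    let mut schema = HashMap::new();
    schema.insert("title", Props { stored: true, indexed: true, ranked: false });
    schema.insert("timestamp", Props { stored: true, indexed: false, ranked: true });

    let mut update = Update::default();
    route_field(&schema, &mut update, "title", "I am a Title");
    route_field(&schema, &mut update, "timestamp", "1234567");
    route_field(&schema, &mut update, "internal", "ignored");

    assert_eq!(update.indexed_words, vec!["i", "am", "a", "title"]);
    assert_eq!(update.ranked_numbers, vec![("timestamp".to_owned(), 1234567)]);
}
```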
Ok(()) - } - - fn serialize_value(&mut self, value: &T) -> Result<(), Self::Error> - where T: Serialize, - { - let key = self.current_key_name.take().unwrap(); - self.serialize_entry(&key, value) - } - - fn serialize_entry( - &mut self, - key: &K, - value: &V, - ) -> Result<(), Self::Error> - where K: Serialize, V: Serialize, - { - let key = key.serialize(KeyToStringSerializer)?; - - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - tokenizer_builder: self.tokenizer_builder, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let number = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, number)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} - -pub struct StructSerializer<'a, 'b, B> { - pub schema: &'a Schema, - pub document_id: DocumentId, - pub update: &'a mut DocumentUpdate<'b>, - pub tokenizer_builder: &'a B, - pub stop_words: &'a HashSet, -} - -impl<'a, 'b, B> ser::SerializeStruct for StructSerializer<'a, 'b, B> -where B: TokenizerBuilder -{ - type Ok = (); - type Error = SerializerError; - - fn serialize_field( - &mut self, - key: &'static str, - value: &T - ) -> Result<(), Self::Error> - where T: Serialize, - { - if let Some(attr) = self.schema.attribute(key) { - let props = self.schema.props(attr); - if props.is_stored() { - let value = bincode::serialize(value).unwrap(); - self.update.insert_attribute_value(attr, &value)?; - } - if props.is_indexed() { - let serializer = IndexerSerializer { - update: self.update, - tokenizer_builder: self.tokenizer_builder, - document_id: self.document_id, - attribute: attr, - stop_words: self.stop_words, - }; - value.serialize(serializer)?; - } - if props.is_ranked() { - let integer = value.serialize(ValueToNumberSerializer)?; - self.update.register_ranked_attribute(attr, integer)?; - } - } - - Ok(()) - } - - fn end(self) -> Result { - Ok(()) - } -} diff --git a/src/database/update/index_event.rs b/src/database/update/index_event.rs deleted file mode 100644 index cd006aa3c..000000000 --- a/src/database/update/index_event.rs +++ /dev/null @@ -1,55 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; - -use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use crate::write_to_bytes::WriteToBytes; -use crate::database::Index; -use crate::data::DocIds; - -pub enum WriteIndexEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a Index), -} - -impl<'a> WriteToBytes for WriteIndexEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteIndexEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteIndexEvent::UpdatedDocuments(index) => { - let _ = bytes.write_u8(1); - index.write_to_bytes(bytes); - } - } - } -} - -pub enum ReadIndexEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(Index), -} - -impl ReadIndexEvent { - pub fn updated_documents(self) -> Option { - use ReadIndexEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(index) => Some(index), - } - } -} - -impl FromSharedDataCursor for ReadIndexEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut 
SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadIndexEvent::RemovedDocuments), - 1 => Index::from_shared_data_cursor(cursor).map(ReadIndexEvent::UpdatedDocuments), - _ => unreachable!(), - } - } -} diff --git a/src/database/update/mod.rs b/src/database/update/mod.rs deleted file mode 100644 index 548fb8bc2..000000000 --- a/src/database/update/mod.rs +++ /dev/null @@ -1,239 +0,0 @@ -use std::collections::{HashSet, BTreeMap}; -use std::error::Error; - -use rocksdb::rocksdb::{Writable, WriteBatch}; -use hashbrown::hash_map::HashMap; -use sdset::{Set, SetBuf}; -use serde::Serialize; - -use crate::database::document_key::{DocumentKey, DocumentKeyAttr}; -use crate::database::serde::serializer::Serializer; -use crate::database::serde::SerializerError; -use crate::database::schema::SchemaAttr; -use crate::database::schema::Schema; -use crate::database::index::IndexBuilder; -use crate::database::{DATA_INDEX, DATA_RANKED_MAP}; -use crate::database::{RankedMap, Number}; -use crate::tokenizer::TokenizerBuilder; -use crate::write_to_bytes::WriteToBytes; -use crate::data::DocIds; -use crate::{DocumentId, DocIndex}; - -pub use self::index_event::{ReadIndexEvent, WriteIndexEvent}; -pub use self::ranked_map_event::{ReadRankedMapEvent, WriteRankedMapEvent}; - -mod index_event; -mod ranked_map_event; - -pub type Token = Vec; // TODO could be replaced by a SmallVec - -pub struct Update { - schema: Schema, - raw_builder: RawUpdateBuilder, -} - -impl Update { - pub(crate) fn new(schema: Schema) -> Update { - Update { schema, raw_builder: RawUpdateBuilder::new() } - } - - pub fn update_document( - &mut self, - document: T, - tokenizer_builder: &B, - stop_words: &HashSet, - ) -> Result - where T: Serialize, - B: TokenizerBuilder, - { - let document_id = self.schema.document_id(&document)?; - - let serializer = Serializer { - schema: &self.schema, - document_id: document_id, - tokenizer_builder: tokenizer_builder, - update: &mut self.raw_builder.document_update(document_id)?, - stop_words: stop_words, - }; - - document.serialize(serializer)?; - - Ok(document_id) - } - - pub fn remove_document(&mut self, document: T) -> Result - where T: Serialize, - { - let document_id = self.schema.document_id(&document)?; - self.raw_builder.document_update(document_id)?.remove()?; - Ok(document_id) - } - - pub(crate) fn build(self) -> Result> { - self.raw_builder.build() - } -} - -#[derive(Copy, Clone, PartialEq, Eq)] -enum UpdateType { - Updated, - Deleted, -} - -use UpdateType::{Updated, Deleted}; - -pub struct RawUpdateBuilder { - documents_update: HashMap, - documents_ranked_fields: RankedMap, - indexed_words: BTreeMap>, - batch: WriteBatch, -} - -impl RawUpdateBuilder { - pub fn new() -> RawUpdateBuilder { - RawUpdateBuilder { - documents_update: HashMap::new(), - documents_ranked_fields: HashMap::new(), - indexed_words: BTreeMap::new(), - batch: WriteBatch::new(), - } - } - - pub fn document_update(&mut self, document_id: DocumentId) -> Result { - use serde::ser::Error; - - match self.documents_update.get(&document_id) { - Some(Deleted) | None => Ok(DocumentUpdate { document_id, inner: self }), - Some(Updated) => Err(SerializerError::custom( - "This document has already been removed and cannot be updated in the same update" - )), - } - } - - pub fn build(self) -> Result> { - // create the list of all the removed documents - let removed_documents = { - let mut document_ids = Vec::new(); - for (id, update_type) in self.documents_update { - if update_type == 
Deleted { - document_ids.push(id); - } - } - - document_ids.sort_unstable(); - let setbuf = SetBuf::new_unchecked(document_ids); - DocIds::new(&setbuf) - }; - - // create the Index of all the document updates - let index = { - let mut builder = IndexBuilder::new(); - for (key, mut indexes) in self.indexed_words { - indexes.sort_unstable(); - let indexes = Set::new_unchecked(&indexes); - builder.insert(key, indexes).unwrap(); - } - builder.build() - }; - - // WARN: removed documents must absolutely - // be merged *before* document updates - - // === index === - - if !removed_documents.is_empty() { - // remove the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteIndexEvent::UpdatedDocuments(&index).into_bytes(); - self.batch.merge(DATA_INDEX, &event_bytes)?; - - // === ranked map === - - if !removed_documents.is_empty() { - // update the ranked map using the appropriate RankedMapEvent - let event_bytes = WriteRankedMapEvent::RemovedDocuments(&removed_documents).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - } - - // update the documents using the appropriate IndexEvent - let event_bytes = WriteRankedMapEvent::UpdatedDocuments(&self.documents_ranked_fields).into_bytes(); - self.batch.merge(DATA_RANKED_MAP, &event_bytes)?; - - Ok(self.batch) - } -} - -pub struct DocumentUpdate<'a> { - document_id: DocumentId, - inner: &'a mut RawUpdateBuilder, -} - -impl<'a> DocumentUpdate<'a> { - pub fn remove(&mut self) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Updated = self.inner.documents_update.entry(self.document_id).or_insert(Deleted) { - return Err(SerializerError::custom( - "This document has already been updated and cannot be removed in the same update" - )); - } - - let start = DocumentKey::new(self.document_id).with_attribute_min(); - let end = DocumentKey::new(self.document_id).with_attribute_max(); // FIXME max + 1 - self.inner.batch.delete_range(start.as_ref(), end.as_ref())?; - - Ok(()) - } - - pub fn insert_attribute_value(&mut self, attr: SchemaAttr, value: &[u8]) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - let key = DocumentKeyAttr::new(self.document_id, attr); - self.inner.batch.put(key.as_ref(), &value)?; - - Ok(()) - } - - pub fn insert_doc_index(&mut self, token: Token, doc_index: DocIndex) -> Result<(), SerializerError> { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted and cannot be updated in the same update" - )); - } - - self.inner.indexed_words.entry(token).or_insert_with(Vec::new).push(doc_index); - - Ok(()) - } - - pub fn register_ranked_attribute( - &mut self, - attr: SchemaAttr, - number: Number, - ) -> Result<(), SerializerError> - { - use serde::ser::Error; - - if let Deleted = self.inner.documents_update.entry(self.document_id).or_insert(Updated) { - return Err(SerializerError::custom( - "This document has already been deleted, ranked attributes cannot be added in the same update" - )); - } - - 
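Both index and ranked-map updates are pushed into the write batch above as merge operands framed by a one-byte tag (`0` for removed documents, `1` for updated ones), which a merge operator can later replay in order. A self-contained sketch of that framing with `byteorder`; unlike the original `from_shared_data_cursor`, which hits `unreachable!()` on an unknown tag, this version returns an error:

```rust
use std::io::{self, Read, Write};
use byteorder::{ReadBytesExt, WriteBytesExt};

// Simplified payloads; the real events carry DocIds, an Index, or a RankedMap.
enum Event {
    Removed(Vec<u8>),
    Updated(Vec<u8>),
}

impl Event {
    fn write_to(&self, out: &mut Vec<u8>) -> io::Result<()> {
        match self {
            Event::Removed(payload) => {
                out.write_u8(0)?; // tag byte first
                out.write_all(payload)
            }
            Event::Updated(payload) => {
                out.write_u8(1)?;
                out.write_all(payload)
            }
        }
    }

    fn read_from<R: Read>(input: &mut R) -> io::Result<Event> {
        let mut payload = Vec::new();
        match input.read_u8()? {
            0 => { input.read_to_end(&mut payload)?; Ok(Event::Removed(payload)) }
            1 => { input.read_to_end(&mut payload)?; Ok(Event::Updated(payload)) }
            tag => Err(io::Error::new(io::ErrorKind::InvalidData, format!("unknown tag {}", tag))),
        }
    }
}

fn main() -> io::Result<()> {
    let mut bytes = Vec::new();
    Event::Updated(b"index-bytes".to_vec()).write_to(&mut bytes)?;
    match Event::read_from(&mut bytes.as_slice())? {
        Event::Updated(payload) => assert_eq!(payload, b"index-bytes"),
        _ => unreachable!(),
    }
    Ok(())
}
```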
self.inner.documents_ranked_fields.insert((self.document_id, attr), number); - - Ok(()) - } -} diff --git a/src/database/update/ranked_map_event.rs b/src/database/update/ranked_map_event.rs deleted file mode 100644 index 5a51f8799..000000000 --- a/src/database/update/ranked_map_event.rs +++ /dev/null @@ -1,58 +0,0 @@ -use std::error::Error; - -use byteorder::{ReadBytesExt, WriteBytesExt}; - -use crate::shared_data_cursor::{SharedDataCursor, FromSharedDataCursor}; -use crate::write_to_bytes::WriteToBytes; -use crate::database::RankedMap; -use crate::data::DocIds; - -pub enum WriteRankedMapEvent<'a> { - RemovedDocuments(&'a DocIds), - UpdatedDocuments(&'a RankedMap), -} - -impl<'a> WriteToBytes for WriteRankedMapEvent<'a> { - fn write_to_bytes(&self, bytes: &mut Vec) { - match self { - WriteRankedMapEvent::RemovedDocuments(doc_ids) => { - let _ = bytes.write_u8(0); - doc_ids.write_to_bytes(bytes); - }, - WriteRankedMapEvent::UpdatedDocuments(ranked_map) => { - let _ = bytes.write_u8(1); - bincode::serialize_into(bytes, ranked_map).unwrap() - } - } - } -} - -pub enum ReadRankedMapEvent { - RemovedDocuments(DocIds), - UpdatedDocuments(RankedMap), -} - -impl ReadRankedMapEvent { - pub fn updated_documents(self) -> Option { - use ReadRankedMapEvent::*; - match self { - RemovedDocuments(_) => None, - UpdatedDocuments(ranked_map) => Some(ranked_map), - } - } -} - -impl FromSharedDataCursor for ReadRankedMapEvent { - type Error = Box; - - fn from_shared_data_cursor(cursor: &mut SharedDataCursor) -> Result { - match cursor.read_u8()? { - 0 => DocIds::from_shared_data_cursor(cursor).map(ReadRankedMapEvent::RemovedDocuments), - 1 => { - let ranked_map = bincode::deserialize_from(cursor)?; - Ok(ReadRankedMapEvent::UpdatedDocuments(ranked_map)) - }, - _ => unreachable!(), - } - } -} diff --git a/src/database/view.rs b/src/database/view.rs deleted file mode 100644 index b1fbc0bdd..000000000 --- a/src/database/view.rs +++ /dev/null @@ -1,201 +0,0 @@ -use std::error::Error; -use std::path::Path; -use std::ops::Deref; -use std::{fmt, marker}; - -use rocksdb::rocksdb_options::{ReadOptions, EnvOptions, ColumnFamilyOptions}; -use rocksdb::rocksdb::{DB, DBVector, Snapshot, SeekKey, SstFileWriter}; -use serde::de::DeserializeOwned; - -use crate::database::{retrieve_data_schema, retrieve_data_index, retrieve_data_ranked_map, retrieve_config}; -use crate::database::serde::deserializer::Deserializer; -use crate::database::{DocumentKey, DocumentKeyAttr}; -use crate::rank::{QueryBuilder, FilterFunc}; -use crate::database::schema::Schema; -use crate::database::index::Index; -use crate::database::RankedMap; -use crate::database::Config; -use crate::DocumentId; - -pub struct DatabaseView -where D: Deref -{ - snapshot: Snapshot, - index: Index, - ranked_map: RankedMap, - schema: Schema, - config: Config, -} - -impl DatabaseView -where D: Deref -{ - pub fn new(snapshot: Snapshot) -> Result, Box> { - let schema = retrieve_data_schema(&snapshot)?; - let index = retrieve_data_index(&snapshot)?; - let ranked_map = retrieve_data_ranked_map(&snapshot)?; - let config = retrieve_config(&snapshot)?; - Ok(DatabaseView { snapshot, index, ranked_map, schema, config }) - } - - pub fn schema(&self) -> &Schema { - &self.schema - } - - pub fn index(&self) -> &Index { - &self.index - } - - pub fn ranked_map(&self) -> &RankedMap { - &self.ranked_map - } - - pub fn into_snapshot(self) -> Snapshot { - self.snapshot - } - - pub fn snapshot(&self) -> &Snapshot { - &self.snapshot - } - - pub fn config(&self) -> &Config { - &self.config - } - - 
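Stepping back to `DocumentUpdate` above: every mutating method opens with the same `entry().or_insert()` guard, so within a single update a document is either updated or removed, never both. A reduced model of that per-document state machine, with `String` errors standing in for `SerializerError::custom`:

```rust
use std::collections::HashMap;

#[derive(Copy, Clone, PartialEq, Eq, Debug)]
enum UpdateType { Updated, Deleted }

#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
struct DocumentId(u64);

#[derive(Default)]
struct UpdateState {
    documents: HashMap<DocumentId, UpdateType>,
}

impl UpdateState {
    // Mirrors insert_attribute_value / insert_doc_index: marks the document
    // Updated, unless it was already Deleted in this same update.
    fn mark_updated(&mut self, id: DocumentId) -> Result<(), String> {
        match self.documents.entry(id).or_insert(UpdateType::Updated) {
            UpdateType::Deleted => Err("already deleted in this update".into()),
            UpdateType::Updated => Ok(()),
        }
    }

    // Mirrors remove(): marks the document Deleted, unless already Updated.
    fn mark_deleted(&mut self, id: DocumentId) -> Result<(), String> {
        match self.documents.entry(id).or_insert(UpdateType::Deleted) {
            UpdateType::Updated => Err("already updated in this update".into()),
            UpdateType::Deleted => Ok(()),
        }
    }
}

fn main() {
    let mut state = UpdateState::default();
    state.mark_updated(DocumentId(0)).unwrap();
    assert!(state.mark_deleted(DocumentId(0)).is_err()); // conflicting ops rejected
    state.mark_deleted(DocumentId(1)).unwrap();
    assert!(state.mark_updated(DocumentId(1)).is_err());
}
```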
pub fn get(&self, key: &[u8]) -> Result, Box> { - Ok(self.snapshot.get(key)?) - } - - pub fn dump_all>(&self, path: P) -> Result<(), Box> { - let path = path.as_ref().to_string_lossy(); - - let env_options = EnvOptions::new(); - let column_family_options = ColumnFamilyOptions::new(); - let mut file_writer = SstFileWriter::new(env_options, column_family_options); - file_writer.open(&path)?; - - let mut iter = self.snapshot.iter(); - iter.seek(SeekKey::Start); - - for (key, value) in &mut iter { - file_writer.put(&key, &value)?; - } - - file_writer.finish()?; - Ok(()) - } - - pub fn query_builder(&self) -> QueryBuilder { - QueryBuilder::new(self.index()) - } - - pub fn raw_field_by_document_id( - &self, - name: &str, - id: DocumentId - ) -> Result>, Box> - { - let attr = self.schema.attribute(name).ok_or("field not found")?; - let key = DocumentKeyAttr::new(id, attr); - let vector = self.snapshot.get(key.as_ref())?; - - Ok(vector.map(|v| v.to_vec())) - } - - pub fn document_by_id(&self, id: DocumentId) -> Result> - where T: DeserializeOwned, - { - let mut deserializer = Deserializer::new(&self.snapshot, &self.schema, id); - Ok(T::deserialize(&mut deserializer)?) - } - - pub fn documents_by_id(&self, ids: I) -> DocumentIter - where T: DeserializeOwned, - I: IntoIterator, - { - DocumentIter { - database_view: self, - document_ids: ids.into_iter(), - _phantom: marker::PhantomData, - } - } -} - -impl fmt::Debug for DatabaseView -where D: Deref -{ - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut options = ReadOptions::new(); - let lower = DocumentKey::new(DocumentId(0)); - options.set_iterate_lower_bound(lower.as_ref()); - - let mut iter = self.snapshot.iter_opt(options); - iter.seek(SeekKey::Start); - let iter = iter.map(|(key, _)| DocumentKeyAttr::from_bytes(&key)); - - if f.alternate() { - writeln!(f, "DatabaseView(")?; - } else { - write!(f, "DatabaseView(")?; - } - - self.schema.fmt(f)?; - - if f.alternate() { - writeln!(f, ",")?; - } else { - write!(f, ", ")?; - } - - f.debug_list().entries(iter).finish()?; - - write!(f, ")") - } -} - -// TODO this is just an iter::Map !!! 
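The `// TODO this is just an iter::Map !!!` note above is accurate: `DocumentIter` only exists to map ids through `document_by_id` while keeping `size_hint`, `ExactSizeIterator`, and `DoubleEndedIterator`, all of which `std::iter::Map` already propagates from its inner iterator. A sketch of the closure-based alternative the TODO hints at, with a stubbed view type:

```rust
#[derive(Clone, Copy, Debug, PartialEq)]
struct DocumentId(u64);

// Stub standing in for DatabaseView; the real lookup deserializes from RocksDB.
struct View;

impl View {
    fn document_by_id(&self, id: DocumentId) -> Result<String, String> {
        Ok(format!("document {}", id.0))
    }

    // What the TODO suggests: Iterator::map does the same job as DocumentIter,
    // and the adapter it returns already forwards size_hint, ExactSizeIterator
    // and DoubleEndedIterator whenever the underlying id iterator provides them.
    fn documents_by_id<'a, I>(&'a self, ids: I) -> impl Iterator<Item = Result<String, String>> + 'a
    where
        I: IntoIterator<Item = DocumentId>,
        I::IntoIter: 'a,
    {
        ids.into_iter().map(move |id| self.document_by_id(id))
    }
}

fn main() {
    let view = View;
    let ids = vec![DocumentId(0), DocumentId(1)];
    let docs: Result<Vec<_>, _> = view.documents_by_id(ids).collect();
    assert_eq!(docs.unwrap(), vec!["document 0".to_string(), "document 1".to_string()]);
}
```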
-pub struct DocumentIter<'a, D, T, I> -where D: Deref -{ - database_view: &'a DatabaseView, - document_ids: I, - _phantom: marker::PhantomData, -} - -impl<'a, D, T, I> Iterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: Iterator, -{ - type Item = Result>; - - fn size_hint(&self) -> (usize, Option) { - self.document_ids.size_hint() - } - - fn next(&mut self) -> Option { - match self.document_ids.next() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} - -impl<'a, D, T, I> ExactSizeIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: ExactSizeIterator + Iterator, -{ } - -impl<'a, D, T, I> DoubleEndedIterator for DocumentIter<'a, D, T, I> -where D: Deref, - T: DeserializeOwned, - I: DoubleEndedIterator + Iterator, -{ - fn next_back(&mut self) -> Option { - match self.document_ids.next_back() { - Some(id) => Some(self.database_view.document_by_id(id)), - None => None - } - } -} diff --git a/src/lib.rs b/src/lib.rs deleted file mode 100644 index 964de8f75..000000000 --- a/src/lib.rs +++ /dev/null @@ -1,136 +0,0 @@ -#![cfg_attr(feature = "nightly", feature(test))] - -pub mod automaton; -pub mod database; -pub mod data; -pub mod rank; -pub mod tokenizer; -mod common_words; -mod shared_data_cursor; -mod write_to_bytes; - -use serde_derive::{Serialize, Deserialize}; - -pub use rocksdb; - -pub use self::tokenizer::Tokenizer; -pub use self::common_words::CommonWords; - -pub fn is_cjk(c: char) -> bool { - (c >= '\u{2e80}' && c <= '\u{2eff}') || - (c >= '\u{2f00}' && c <= '\u{2fdf}') || - (c >= '\u{3040}' && c <= '\u{309f}') || - (c >= '\u{30a0}' && c <= '\u{30ff}') || - (c >= '\u{3100}' && c <= '\u{312f}') || - (c >= '\u{3200}' && c <= '\u{32ff}') || - (c >= '\u{3400}' && c <= '\u{4dbf}') || - (c >= '\u{4e00}' && c <= '\u{9fff}') || - (c >= '\u{f900}' && c <= '\u{faff}') -} - -/// Represent an internally generated document unique identifier. -/// -/// It is used to inform the database the document you want to deserialize. -/// Helpful for custom ranking. -#[derive(Serialize, Deserialize)] -#[derive(Debug, Copy, Clone, Eq, PartialEq, PartialOrd, Ord, Hash)] -pub struct DocumentId(u64); - -/// This structure represent the position of a word -/// in a document and its attributes. -/// -/// This is stored in the map, generated at index time, -/// extracted and interpreted at search time. -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -#[repr(C)] -pub struct DocIndex { - /// The document identifier where the word was found. - pub document_id: DocumentId, - - /// The attribute in the document where the word was found - /// along with the index in it. - pub attribute: u16, - pub word_index: u16, - - /// The position in bytes where the word was found - /// along with the length of it. - /// - /// It informs on the original word area in the text indexed - /// without needing to run the tokenizer again. - pub char_index: u16, - pub char_length: u16, -} - -/// This structure represent a matching word with informations -/// on the location of the word in the document. -/// -/// The order of the field is important because it defines -/// the way these structures are ordered between themselves. -/// -/// The word in itself is not important. -// TODO do data oriented programming ? very arrays ? -#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] -pub struct Match { - /// The word index in the query sentence. - /// Same as the `attribute_index` but for the query words. 
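`DocIndex` above is kept deliberately small: with `#[repr(C)]`, a `u64` identifier followed by four `u16` fields packs each word occurrence into exactly 16 bytes, which the `docindex_mem_size` test at the end of this file pins down; that size matters because the index stores one `DocIndex` per word occurrence. The same layout check in isolation:

```rust
use std::mem;

#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
struct DocIndex {
    document_id: u64, // stand-in for DocumentId(u64)
    attribute: u16,
    word_index: u16,
    char_index: u16,
    char_length: u16,
}

fn main() {
    // 8 bytes of id + 4 * 2 bytes of position data, with no padding required.
    assert_eq!(mem::size_of::<DocIndex>(), 16);
}
```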
- /// - /// Used to retrieve the automaton that match this word. - pub query_index: u32, - - /// The distance the word has with the query word - /// (i.e. the Levenshtein distance). - pub distance: u8, - - /// The attribute in the document where the word was found - /// along with the index in it. - pub attribute: u16, - pub word_index: u16, - - /// Whether the word that match is an exact match or a prefix. - pub is_exact: bool, - - /// The position in bytes where the word was found - /// along with the length of it. - /// - /// It informs on the original word area in the text indexed - /// without needing to run the tokenizer again. - pub char_index: u16, - pub char_length: u16, -} - -impl Match { - pub fn zero() -> Self { - Match { - query_index: 0, - distance: 0, - attribute: 0, - word_index: 0, - is_exact: false, - char_index: 0, - char_length: 0, - } - } - - pub fn max() -> Self { - Match { - query_index: u32::max_value(), - distance: u8::max_value(), - attribute: u16::max_value(), - word_index: u16::max_value(), - is_exact: true, - char_index: u16::max_value(), - char_length: u16::max_value(), - } - } -} - -#[cfg(test)] -mod tests { - use super::*; - use std::mem; - - #[test] - fn docindex_mem_size() { - assert_eq!(mem::size_of::(), 16); - } -} diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs deleted file mode 100644 index ed146c06f..000000000 --- a/src/tokenizer/mod.rs +++ /dev/null @@ -1,259 +0,0 @@ -use std::mem; -use crate::is_cjk; -use self::Separator::*; - -pub trait TokenizerBuilder { - fn build<'a>(&self, text: &'a str) -> Box> + 'a>; -} - -pub struct DefaultBuilder; - -impl DefaultBuilder { - pub fn new() -> DefaultBuilder { - DefaultBuilder - } -} - -#[derive(Debug, PartialEq, Eq)] -pub struct Token<'a> { - pub word: &'a str, - pub word_index: usize, - pub char_index: usize, -} - -impl TokenizerBuilder for DefaultBuilder { - fn build<'a>(&self, text: &'a str) -> Box> + 'a> { - Box::new(Tokenizer::new(text)) - } -} - -pub struct Tokenizer<'a> { - word_index: usize, - char_index: usize, - inner: &'a str, -} - -impl<'a> Tokenizer<'a> { - pub fn new(string: &str) -> Tokenizer { - let mut char_advance = 0; - let mut index_advance = 0; - for (n, (i, c)) in string.char_indices().enumerate() { - char_advance = n; - index_advance = i; - if detect_separator(c).is_none() { break } - } - - Tokenizer { - word_index: 0, - char_index: char_advance, - inner: &string[index_advance..], - } - } -} - -#[derive(Debug, Clone, Copy)] -enum Separator { - Short, - Long, -} - -impl Separator { - fn add(self, add: Separator) -> Separator { - match (self, add) { - (_, Long) => Long, - (Short, Short) => Short, - (Long, Short) => Long, - } - } - - fn to_usize(self) -> usize { - match self { - Short => 1, - Long => 8, - } - } -} - -fn detect_separator(c: char) -> Option { - match c { - '.' | ';' | ',' | '!' | '?' 
| '-' | '(' | ')' => Some(Long), - ' ' | '\'' | '"' => Some(Short), - _ => None, - } -} - -impl<'a> Iterator for Tokenizer<'a> { - type Item = Token<'a>; - - fn next(&mut self) -> Option { - let mut start_word = None; - let mut distance = None; - - for (i, c) in self.inner.char_indices() { - match detect_separator(c) { - Some(sep) => { - if let Some(start_word) = start_word { - let (prefix, tail) = self.inner.split_at(i); - let (spaces, word) = prefix.split_at(start_word); - - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - self.char_index += word.chars().count(); - return Some(token) - } - - distance = Some(distance.map_or(sep, |s| s.add(sep))); - }, - None => { - // if this is a Chinese, a Japanese or a Korean character - // See - if is_cjk(c) { - match start_word { - Some(start_word) => { - let (prefix, tail) = self.inner.split_at(i); - let (spaces, word) = prefix.split_at(start_word); - - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - self.word_index += 1; - self.char_index += word.chars().count(); - - return Some(token) - }, - None => { - let (prefix, tail) = self.inner.split_at(i + c.len_utf8()); - let (spaces, word) = prefix.split_at(i); - - self.inner = tail; - self.char_index += spaces.chars().count(); - self.word_index += distance.map(Separator::to_usize).unwrap_or(0); - - let token = Token { - word: word, - word_index: self.word_index, - char_index: self.char_index, - }; - - if tail.chars().next().and_then(detect_separator).is_none() { - self.word_index += 1; - } - self.char_index += 1; - - return Some(token) - } - } - } - - if start_word.is_none() { start_word = Some(i) } - }, - } - } - - if let Some(start_word) = start_word { - let prefix = mem::replace(&mut self.inner, ""); - let (spaces, word) = prefix.split_at(start_word); - - let token = Token { - word: word, - word_index: self.word_index + distance.map(Separator::to_usize).unwrap_or(0), - char_index: self.char_index + spaces.chars().count(), - }; - return Some(token) - } - - None - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn easy() { - let mut tokenizer = Tokenizer::new("salut"); - - assert_eq!(tokenizer.next(), Some(Token { word: "salut", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), None); - - let mut tokenizer = Tokenizer::new("yo "); - - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), None); - } - - #[test] - fn hard() { - let mut tokenizer = Tokenizer::new(" .? yo lolo. aïe (ouch)"); - - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 1, char_index: 7 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 13 })); - assert_eq!(tokenizer.next(), Some(Token { word: "ouch", word_index: 17, char_index: 18 })); - assert_eq!(tokenizer.next(), None); - - let mut tokenizer = Tokenizer::new("yo ! lolo ? wtf - lol . 
aïe ,"); - - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); - assert_eq!(tokenizer.next(), Some(Token { word: "wtf", word_index: 16, char_index: 12 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 18 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 32, char_index: 24 })); - assert_eq!(tokenizer.next(), None); - } - - #[test] - fn hard_long_chars() { - let mut tokenizer = Tokenizer::new(" .? yo 😂. aïe"); - - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😂", word_index: 1, char_index: 7 })); - assert_eq!(tokenizer.next(), Some(Token { word: "aïe", word_index: 9, char_index: 10 })); - assert_eq!(tokenizer.next(), None); - - let mut tokenizer = Tokenizer::new("yo ! lolo ? 😱 - lol . 😣 ,"); - - assert_eq!(tokenizer.next(), Some(Token { word: "yo", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolo", word_index: 8, char_index: 5 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😱", word_index: 16, char_index: 12 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lol", word_index: 24, char_index: 16 })); - assert_eq!(tokenizer.next(), Some(Token { word: "😣", word_index: 32, char_index: 22 })); - assert_eq!(tokenizer.next(), None); - } - - #[test] - fn hard_kanjis() { - let mut tokenizer = Tokenizer::new("\u{2ec4}lolilol\u{2ec7}"); - - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 1, char_index: 1 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 2, char_index: 8 })); - assert_eq!(tokenizer.next(), None); - - let mut tokenizer = Tokenizer::new("\u{2ec4}\u{2ed3}\u{2ef2} lolilol - hello \u{2ec7}"); - - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec4}", word_index: 0, char_index: 0 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ed3}", word_index: 1, char_index: 1 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ef2}", word_index: 2, char_index: 2 })); - assert_eq!(tokenizer.next(), Some(Token { word: "lolilol", word_index: 3, char_index: 4 })); - assert_eq!(tokenizer.next(), Some(Token { word: "hello", word_index: 11, char_index: 14 })); - assert_eq!(tokenizer.next(), Some(Token { word: "\u{2ec7}", word_index: 12, char_index: 23 })); - assert_eq!(tokenizer.next(), None); - } -}